mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod cpu_order;
mod stats;

use std::ffi::c_int;
use std::ffi::CStr;
use std::mem;
use std::mem::MaybeUninit;
use std::str;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use clap_num::number_range;
use cpu_order::CpuOrder;
use cpu_order::PerfCpuOrder;
use crossbeam::channel;
use crossbeam::channel::Receiver;
use crossbeam::channel::RecvTimeoutError;
use crossbeam::channel::Sender;
use crossbeam::channel::TrySendError;
use libbpf_rs::skel::Skel;
use libbpf_rs::OpenObject;
use libbpf_rs::PrintLevel;
use libbpf_rs::ProgramInput;
use libc::c_char;
use plain::Plain;
use scx_arena::ArenaLib;
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::init_libbpf_logging;
use scx_utils::ksym_exists;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::try_set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::EnergyModel;
use scx_utils::TopologyArgs;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::SchedSample;
use stats::SchedSamples;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;
use tracing::{debug, info, warn};
use tracing_subscriber::filter::EnvFilter;

const SCHEDULER_NAME: &str = "scx_lavd";

#[derive(Debug, Parser)]
struct Opts {
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    #[clap(long = "mig-delta-pct", default_value = "0", value_parser = Opts::mig_delta_pct_range)]
    mig_delta_pct: u8,

    #[clap(long = "lb-low-util-pct", default_value = "25", value_parser = Opts::lb_low_util_pct_range)]
    lb_low_util_pct: u8,

    #[clap(long = "lb-local-dsq-util-pct", default_value = "10", value_parser = Opts::lb_local_dsq_util_pct_range)]
    lb_local_dsq_util_pct: u8,

    #[clap(long = "pinned-slice-us", default_value = "5000")]
    pinned_slice_us: Option<u64>,

    #[clap(long = "preempt-shift", default_value = "6", value_parser = Opts::preempt_shift_range)]
    preempt_shift: u8,

    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    #[clap(long = "no-slice-boost", action = clap::ArgAction::SetTrue)]
    no_slice_boost: bool,

    #[clap(long = "per-cpu-dsq", action = clap::ArgAction::SetTrue)]
    per_cpu_dsq: bool,

    #[clap(long = "enable-cpu-bw", action = clap::ArgAction::SetTrue)]
    enable_cpu_bw: bool,

    #[clap(long = "partial", action = clap::ArgAction::SetTrue)]
    partial: bool,

    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    #[clap(long)]
    stats: Option<f64>,

    #[clap(long)]
    monitor: Option<f64>,

    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    #[clap(long, default_value = "info")]
    log_level: String,

    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    #[clap(long)]
    run_id: Option<u64>,

    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,

    #[clap(flatten)]
    topology: Option<TopologyArgs>,
}

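// The mode flags (--autopilot, --autopower, --performance, --powersave,
// --balanced) are mutually exclusive. The can_*() predicates below encode
// which combinations are allowed; every mode except performance also
// requires that core compaction has not been disabled.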
impl Opts {
    fn can_autopilot(&self) -> bool {
        !self.autopower
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_autopower(&self) -> bool {
        !self.autopilot
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_performance(&self) -> bool {
        !self.autopilot && !self.autopower && !self.powersave && !self.balanced
    }

    fn can_balanced(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.powersave
            && !self.no_core_compaction
    }

    fn can_powersave(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.balanced
            && !self.no_core_compaction
    }

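    // Resolve the final option set: default to autopilot when no explicit
    // mode was requested, validate that the chosen mode does not conflict
    // with other flags, and derive dependent settings (e.g., performance
    // mode disables core compaction). Returns None when the combination of
    // options is invalid.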
    fn proc(&mut self) -> Option<&mut Self> {
        if !self.autopilot {
            self.autopilot = self.can_autopilot();
        }

        if self.autopilot {
            if !self.can_autopilot() {
                info!("Autopilot mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopilot mode is enabled.");
        }

        if self.autopower {
            if !self.can_autopower() {
                info!("Autopower mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopower mode is enabled.");
        }

        if self.performance {
            if !self.can_performance() {
                info!("Performance mode cannot be used with conflicting options.");
                return None;
            }
            info!("Performance mode is enabled.");
            self.no_core_compaction = true;
        }

        if self.powersave {
            if !self.can_powersave() {
                info!("Powersave mode cannot be used with conflicting options.");
                return None;
            }
            info!("Powersave mode is enabled.");
            self.no_core_compaction = false;
        }

        if self.balanced {
            if !self.can_balanced() {
                info!("Balanced mode cannot be used with conflicting options.");
                return None;
            }
            info!("Balanced mode is enabled.");
            self.no_core_compaction = false;
        }

        if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
            self.no_use_em = true;
            info!("Energy model won't be used for CPU preference order.");
        }

        if let Some(pinned_slice) = self.pinned_slice_us {
            if pinned_slice == 0 {
                info!("Pinned task slice mode is disabled. Pinned tasks will use per-domain DSQs.");
            } else if pinned_slice < self.slice_min_us || pinned_slice > self.slice_max_us {
                info!(
                    "pinned-slice-us ({}) must be between slice-min-us ({}) and slice-max-us ({})",
                    pinned_slice, self.slice_min_us, self.slice_max_us
                );
                return None;
            } else {
                info!(
                    "Pinned task slice mode is enabled ({} us). Pinned tasks will use per-CPU DSQs.",
                    pinned_slice
                );
            }
        }

        Some(self)
    }

    fn preempt_shift_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 10)
    }

    fn mig_delta_pct_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 100)
    }

    fn lb_low_util_pct_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 100)
    }

    fn lb_local_dsq_util_pct_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 100)
    }
}

unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}

impl introspec {
    fn new() -> Self {
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }
}

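// Userspace half of the scheduler: it owns the loaded BPF skeleton, the
// attached struct_ops link, the introspection ring buffer, and the stats
// server that monitoring clients talk to.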
struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
    mseq_id: u64,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Number of possible CPU IDs ({}) exceeds the maximum ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        try_set_rlimit_infinity();

        let debug_level = if opts.log_level.contains("trace") {
            2
        } else if opts.log_level.contains("debug") {
            1
        } else {
            0
        };
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(debug_level > 1);
        init_libbpf_logging(Some(PrintLevel::Debug));

        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops, open_opts)?;

        if !opts.no_futex_boost {
            if !Self::attach_futex_ftraces(&mut skel)? {
                info!("Failed to attach futex ftraces. Trying tracepoints instead.");
                if !Self::attach_futex_tracepoints(&mut skel)? {
                    info!("Failed to attach futex tracepoints.");
                }
            }
        }

        let order = CpuOrder::new(opts.topology.as_ref()).unwrap();
        Self::init_cpus(&mut skel, &order);
        Self::init_cpdoms(&mut skel, &order);

        if order.cpdom_map.len() > 1 {
            Self::attach_execve_tracepoints(&mut skel)?;
        }

        Self::init_globals(&mut skel, opts, &order, debug_level);

        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let task_size = std::mem::size_of::<types::task_ctx>();
        let arenalib = ArenaLib::init(skel.object_mut(), task_size, *NR_CPU_IDS)?;
        arenalib.setup()?;

        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }

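    // Futex hooks let the scheduler spot lock holders so they can be
    // boosted. The fexit ftrace variants are preferred; init() falls back
    // to the raw syscall tracepoints below when ftrace is unavailable.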
    fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
        let ftraces = vec![
            ("__futex_wait", &skel.progs.fexit___futex_wait),
            ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
            (
                "futex_wait_requeue_pi",
                &skel.progs.fexit_futex_wait_requeue_pi,
            ),
            ("futex_wake", &skel.progs.fexit_futex_wake),
            ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
            ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
            ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
        ];

        if !compat::tracer_available("function")? {
            info!("Ftrace is not enabled in the kernel.");
            return Ok(false);
        }

        compat::cond_kprobes_enable(ftraces)
    }

    fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
        let tracepoints = vec![
            ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
            ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
            (
                "syscalls:sys_exit_futex_wait",
                &skel.progs.rtp_sys_exit_futex_wait,
            ),
            (
                "syscalls:sys_exit_futex_waitv",
                &skel.progs.rtp_sys_exit_futex_waitv,
            ),
            (
                "syscalls:sys_exit_futex_wake",
                &skel.progs.rtp_sys_exit_futex_wake,
            ),
        ];

        compat::cond_tracepoints_enable(tracepoints)
    }

    fn attach_execve_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
        let tracepoints = vec![
            (
                "syscalls:sys_enter_execve",
                &skel.progs.cond_hook_sys_enter_execve,
            ),
            (
                "syscalls:sys_enter_execveat",
                &skel.progs.cond_hook_sys_enter_execveat,
            ),
        ];

        compat::cond_tracepoints_enable(tracepoints)
    }

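    // Publish the per-CPU topology (capacity, big/turbo flags, SMT sibling)
    // and the performance-vs-CPU-order (PCO) tables to the BPF side through
    // the read-only data section.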
    fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        debug!("{:#?}", order);

        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        for cpu in order.cpuids.iter() {
            rodata.cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
            rodata.cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
            rodata.cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
            rodata.cpu_sibling[cpu.cpu_adx] = cpu.cpu_sibling as u32;
        }

        let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
        if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
            panic!("Generated performance vs. CPU order states are too complex ({nr_pco_states}) to handle");
        }
        rodata.nr_pco_states = nr_pco_states;

        for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
            Self::init_pco_tuple(skel, i, pco);
            info!("{:#}", pco);
        }

        let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
        for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
            Self::init_pco_tuple(skel, i as usize, last_pco);
        }
    }

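    // Each PCO state is one row of pco_table: the primary CPUs come first
    // (pco_nr_primary entries), followed by the overflow CPUs.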
    fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
        let cpus_perf = pco.cpus_perf.borrow();
        let cpus_ovflw = pco.cpus_ovflw.borrow();
        let pco_nr_primary = cpus_perf.len();

        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.pco_bounds[i] = pco.perf_cap as u32;
        rodata.pco_nr_primary[i] = pco_nr_primary as u16;

        for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
            rodata.pco_table[i][j] = cpu_adx as u16;
        }

        for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
            let k = j + pco_nr_primary;
            rodata.pco_table[i][k] = cpu_adx as u16;
        }
    }

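    // Initialize the compute-domain (cpdom) contexts in the BSS. Each
    // domain's CPU set is a bitmask split into 64-bit words: CPU c maps to
    // bit (c % 64) of word (c / 64).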
    fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        let cpdom_ctxs = &mut skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs;
        for (k, v) in order.cpdom_map.iter() {
            cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
            cpdom_ctxs[v.cpdom_id].alt_id = v.cpdom_alt_id.get() as u64;
            cpdom_ctxs[v.cpdom_id].numa_id = k.numa_adx as u8;
            cpdom_ctxs[v.cpdom_id].llc_id = k.llc_adx as u8;
            cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
            cpdom_ctxs[v.cpdom_id].is_valid = 1;
            for cpu_id in v.cpu_ids.iter() {
                let i = cpu_id / 64;
                let j = cpu_id % 64;
                cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
            }

            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
                panic!("The processor topology is too complex to handle in BPF.");
            }

            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                let nr_neighbors = neighbors.borrow().len() as u8;
                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                    panic!("The processor topology is too complex to handle in BPF.");
                }
                cpdom_ctxs[v.cpdom_id].nr_neighbors[k] = nr_neighbors;
                for (i, &id) in neighbors.borrow().iter().enumerate() {
                    let idx = (k * LAVD_CPDOM_MAX_NR as usize) + i;
                    cpdom_ctxs[v.cpdom_id].neighbor_ids[idx] = id as u8;
                }
            }
        }
    }

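    // Copy the remaining tunables into the BPF global sections and set up
    // the struct_ops operation flags before the skeleton is loaded.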
    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder, debug_level: u8) {
        let bss_data = skel.maps.bss_data.as_mut().unwrap();
        bss_data.no_preemption = opts.no_preemption;
        bss_data.no_core_compaction = opts.no_core_compaction;
        bss_data.no_freq_scaling = opts.no_freq_scaling;
        bss_data.is_powersave_mode = opts.powersave;

        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.nr_llcs = order.nr_llcs as u64;
        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
        rodata.is_smt_active = order.smt_enabled;
        rodata.is_autopilot_on = opts.autopilot;
        rodata.verbose = debug_level;
        rodata.slice_max_ns = opts.slice_max_us * 1000;
        rodata.slice_min_ns = opts.slice_min_us * 1000;
        rodata.pinned_slice_ns = opts.pinned_slice_us.map(|v| v * 1000).unwrap_or(0);
        rodata.preempt_shift = opts.preempt_shift;
        rodata.mig_delta_pct = opts.mig_delta_pct;
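        // The *_pct options are percentages; the BPF side works with .10
        // fixed-point fractions of 1024, so (pct << 10) / 100 converts
        // them (e.g., 25% -> 256).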
        rodata.lb_low_util_wall = ((opts.lb_low_util_pct as u64) << 10) / 100;
        rodata.lb_local_dsq_util_wall = ((opts.lb_local_dsq_util_pct as u64) << 10) / 100;
        rodata.no_use_em = opts.no_use_em as u8;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.no_slice_boost = opts.no_slice_boost;
        rodata.per_cpu_dsq = opts.per_cpu_dsq;
        rodata.enable_cpu_bw = opts.enable_cpu_bw;

        if !ksym_exists("scx_group_set_bandwidth").unwrap() {
            skel.struct_ops.lavd_ops_mut().cgroup_set_bandwidth = std::ptr::null_mut();
            warn!("The kernel does not support ops.cgroup_set_bandwidth(), so it is disabled.");
        }

        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;

        if opts.partial {
            skel.struct_ops.lavd_ops_mut().flags |= *compat::SCX_OPS_SWITCH_PARTIAL;
        }
    }

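    // Monotonically increasing sequence number for introspection messages,
    // kept in an atomic so the ring buffer callback needs no other
    // synchronization.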
    fn get_msg_seq_id() -> u64 {
        static MSEQ: AtomicU64 = AtomicU64::new(0);
        MSEQ.fetch_add(1, Ordering::Relaxed) + 1
    }

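    // Ring buffer callback: decode one BPF-emitted msg_task_ctx sample,
    // convert its fixed-size C strings, and forward it as a SchedSample to
    // the stats thread. Returning 0 tells libbpf to keep consuming; a full
    // channel simply drops the sample.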
    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;

        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_waker_cm: *const c_char = (&tx.waker_comm as *const [c_char; 17]) as *const c_char;
        let c_waker_cm_str: &CStr = unsafe { CStr::from_ptr(c_waker_cm) };
        let waker_comm: &str = c_waker_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tx.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tx.cpu_id,
            prev_cpu_id: tx.prev_cpu_id,
            suggested_cpu_id: tx.suggested_cpu_id,
            waker_pid: tx.waker_pid,
            waker_comm: waker_comm.into(),
            slice_wall: tx.slice_wall,
            lat_cri: tx.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            rerunnable_interval_wall: tx.rerunnable_interval_wall,
            resched_interval_wall: tx.resched_interval_wall,
            run_freq: tx.run_freq,
            avg_runtime_wall: tx.avg_runtime_wall,
            wait_freq: tx.wait_freq,
            wake_freq: tx.wake_freq,
            perf_cri: tx.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util_wall: tx.cpu_util_wall,
            cpu_util_invr: tx.cpu_util_invr,
            steal_util_wall: tx.steal_util_wall,
            steal_util_invr: tx.steal_util_invr,
            dom_pinned_util_wall: tx.dom_pinned_util_wall,
            dom_pinned_util_invr: tx.dom_pinned_util_invr,
            nr_active: tx.nr_active,
            dsq_id: tx.dsq_id,
            dsq_consume_lat: tx.dsq_consume_lat,
            slice_used_wall: tx.last_slice_used_wall,
        }) {
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
        }
    }

    fn prep_introspec(&mut self) {
        let bss_data = self.skel.maps.bss_data.as_mut().unwrap();
        if !bss_data.is_monitored {
            bss_data.is_monitored = true;
        }
        bss_data.intrspc.cmd = self.intrspc.cmd;
        bss_data.intrspc.arg = self.intrspc.arg;
    }

    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = LAVD_CMD_NOP;
    }

    fn get_pc(x: u64, y: u64) -> f64 {
        100. * x as f64 / y as f64
    }

    fn get_power_mode(power_mode: i32) -> &'static str {
        match power_mode as u32 {
            LAVD_PM_PERFORMANCE => "performance",
            LAVD_PM_BALANCED => "balanced",
            LAVD_PM_POWERSAVE => "powersave",
            _ => "unknown",
        }
    }

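    // Serve one stats request. A sampler registers itself first
    // (NewSampler); any later request carrying a stale thread ID gets
    // StatsRes::Bye so an old monitor client disconnects cleanly.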
    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let nr_preempt = st.nr_preempt;
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    nr_preempt,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }

    fn stop_monitoring(&mut self) {
        let bss_data = self.skel.maps.bss_data.as_mut().unwrap();
        if bss_data.is_monitored {
            bss_data.is_monitored = false;
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

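    // Switch the BPF side's power mode by invoking the set_power_profile
    // syscall program via test_run, passing power_arg as its input context.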
    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
        let prog = &mut self.skel.progs.set_power_profile;
        let mut args = power_arg {
            power_mode: mode as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
        let profile = fetch_power_profile(false);
        if profile == prev_profile {
            return (true, profile);
        }

        let _ = match profile {
            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
            PowerProfile::Unknown => {
                return (false, profile);
            }
        };

        info!("Set the scheduler's power profile to {profile} mode.");
        (true, profile)
    }

    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {
                    self.stop_monitoring();
                }
                Err(e) => {
                    self.stop_monitoring();
                    Err(e)?
                }
            }
            self.cleanup_introspec();
        }
        self.rb_mgr.consume().unwrap();

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");

        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

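// Logging setup: an explicit RUST_LOG-style environment filter wins, then
// --log-level, then "info" as the final fallback.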
fn init_log(opts: &Opts) {
    let env_filter = EnvFilter::try_from_default_env()
        .or_else(|_| match EnvFilter::try_new(&opts.log_level) {
            Ok(filter) => Ok(filter),
            Err(e) => {
                eprintln!(
                    "invalid log level: {}, using info (error: {})",
                    opts.log_level, e
                );
                EnvFilter::try_new("info")
            }
        })
        .unwrap_or_else(|_| EnvFilter::new("info"));

    match tracing_subscriber::fmt()
        .with_env_filter(env_filter)
        .with_target(true)
        .with_thread_ids(true)
        .with_file(true)
        .with_line_number(true)
        .try_init()
    {
        Ok(()) => {}
        Err(e) => eprintln!("failed to init logger: {}", e),
    }
}

#[clap_main::clap_main]
fn main(mut opts: Opts) -> Result<()> {
    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    if opts.verbose > 0 {
        warn!("Setting verbosity via -v is deprecated and will become an error in a future release.");
    }

    if let Some(run_id) = opts.run_id {
        info!("scx_lavd run_id: {}", run_id);
    }

    if opts.monitor.is_none() && opts.monitor_sched_samples.is_none() {
        opts.proc().unwrap();
        info!("{:#?}", opts);
    }

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
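    // Keep re-initializing and re-running the scheduler until it exits
    // without requesting a restart (UserExitInfo::should_restart()).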
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}