1mod bpf_skel;
10pub use bpf_skel::*;
11pub mod bpf_intf;
12mod bpf_streams;
13pub use bpf_intf::*;
14
15mod cpu_order;
16use scx_utils::init_libbpf_logging;
17mod stats;
18use std::ffi::c_int;
19use std::ffi::CStr;
20use std::mem;
21use std::mem::MaybeUninit;
22use std::str;
23use std::sync::atomic::AtomicBool;
24use std::sync::atomic::Ordering;
25use std::sync::Arc;
26use std::thread::ThreadId;
27use std::time::Duration;
28
29use anyhow::Context;
30use anyhow::Result;
31use clap::Parser;
32use clap_num::number_range;
33use cpu_order::CpuOrder;
34use cpu_order::PerfCpuOrder;
35use crossbeam::channel;
36use crossbeam::channel::Receiver;
37use crossbeam::channel::RecvTimeoutError;
38use crossbeam::channel::Sender;
39use crossbeam::channel::TrySendError;
40use libbpf_rs::skel::Skel;
41use libbpf_rs::OpenObject;
42use libbpf_rs::PrintLevel;
43use libbpf_rs::ProgramInput;
44use libc::c_char;
45use plain::Plain;
46use scx_arena::ArenaLib;
47use scx_stats::prelude::*;
48use scx_utils::autopower::{fetch_power_profile, PowerProfile};
49use scx_utils::build_id;
50use scx_utils::compat;
51use scx_utils::ksym_exists;
52use scx_utils::libbpf_clap_opts::LibbpfOpts;
53use scx_utils::scx_ops_attach;
54use scx_utils::scx_ops_load;
55use scx_utils::scx_ops_open;
56use scx_utils::try_set_rlimit_infinity;
57use scx_utils::uei_exited;
58use scx_utils::uei_report;
59use scx_utils::EnergyModel;
60use scx_utils::TopologyArgs;
61use scx_utils::UserExitInfo;
62use scx_utils::NR_CPU_IDS;
63use stats::SchedSample;
64use stats::SchedSamples;
65use stats::StatsReq;
66use stats::StatsRes;
67use stats::SysStats;
68use tracing::{debug, info, warn};
69use tracing_subscriber::filter::EnvFilter;
70
/// Scheduler name used in this program's log messages (for example the
/// "Unregister ..." message emitted when the scheduler is dropped).
const SCHEDULER_NAME: &str = "scx_lavd";
// Command-line options of the scheduler, parsed with clap's derive API.
//
// NOTE(review): the comments in this struct are deliberately plain `//`
// comments. clap turns `///` doc comments into `--help` text, so using them
// here would change the program's output.
#[derive(Debug, Parser)]
struct Opts {
    // Repeatable `-v` counter. `main` only uses it to warn that the flag is
    // deprecated.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    // Autopilot mode. `Opts::proc` turns it on by default when no conflicting
    // mode option is given; the value is exported to BPF as `is_autopilot_on`.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    // Autopower mode. When set, the run loop polls the system power profile
    // and applies it to the scheduler whenever it changes.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    // Performance mode. `Opts::proc` forces `no_core_compaction` on, and the
    // run loop starts with the performance power mode.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    // Powersave mode. The run loop starts with the powersave power mode and
    // the flag is exported to BPF as `is_powersave_mode`.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    // Balanced mode. Also the power mode the run loop starts with when neither
    // `performance` nor `powersave` is set.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    // Upper slice bound in microseconds; converted to nanoseconds
    // (`slice_max_ns`) before it is handed to BPF.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    // Lower slice bound in microseconds; converted to nanoseconds
    // (`slice_min_ns`) before it is handed to BPF.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    // Percentage exported to BPF as `lat_load_target_pct`. Validated by
    // `Opts::lat_load_target_pct_range` (bounds 0 and 200).
    #[clap(long = "lat-load-target-pct", default_value = "100", value_parser=Opts::lat_load_target_pct_range)]
    lat_load_target_pct: u16,

    // Percentage exported to BPF as `mig_delta_pct`. Validated by
    // `Opts::mig_delta_pct_range` (bounds 0 and 100).
    #[clap(long = "mig-delta-pct", default_value = "0", value_parser=Opts::mig_delta_pct_range)]
    mig_delta_pct: u8,

    // Percentage (bounds 0 and 100) that is rescaled to a 1024-based value
    // and exported to BPF as `lb_low_util_wall`.
    #[clap(long = "lb-low-util-pct", default_value = "25", value_parser=Opts::lb_low_util_pct_range)]
    lb_low_util_pct: u8,

    // Percentage (bounds 0 and 100) that is rescaled to a 1024-based value
    // and exported to BPF as `lb_local_dsq_util_wall`.
    #[clap(long = "lb-local-dsq-util-pct", default_value = "10", value_parser=Opts::lb_local_dsq_util_pct_range)]
    lb_local_dsq_util_pct: u8,

    // Slice for pinned tasks in microseconds. `Opts::proc` treats 0 as
    // "disabled" and otherwise requires the value to lie between
    // `slice_min_us` and `slice_max_us`.
    #[clap(long = "pinned-slice-us", default_value = "5000")]
    pinned_slice_us: Option<u64>,

    // Value exported to BPF as `preempt_shift`. Validated by
    // `Opts::preempt_shift_range` (bounds 0 and 10).
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    // User-supplied CPU preference order. In this file a non-empty value only
    // has the effect of disabling the energy model in `Opts::proc`.
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    // Do not use the energy model for the CPU preference order. Also forced on
    // by `Opts::proc` when no energy model exists or `cpu_pref_order` is set.
    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    // Skip attaching the futex ftrace/tracepoint programs during init.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    // Exported to BPF read-only data as `no_fast_lb`.
    #[clap(long = "no-fast-lb", action = clap::ArgAction::SetTrue)]
    no_fast_lb: bool,

    // Exported to BPF as `no_preemption`.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    // Exported to BPF read-only data as `no_wake_sync`.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    // Exported to BPF read-only data as `no_slice_boost`.
    #[clap(long = "no-slice-boost", action = clap::ArgAction::SetTrue)]
    no_slice_boost: bool,

    // Exported to BPF read-only data as `per_cpu_dsq`.
    #[clap(long = "per-cpu-dsq", action = clap::ArgAction::SetTrue)]
    per_cpu_dsq: bool,

    // Exported to BPF read-only data as `enable_cpu_bw`.
    #[clap(long = "enable-cpu-bw", action = clap::ArgAction::SetTrue)]
    enable_cpu_bw: bool,

    // Adds `SCX_OPS_SWITCH_PARTIAL` to the struct_ops flags.
    #[clap(long = "partial", action = clap::ArgAction::SetTrue)]
    partial: bool,

    // Exported to BPF as `no_core_compaction`. Conflicts with every mode
    // option except `performance` (see the `can_*` helpers).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    // Exported to BPF as `no_freq_scaling`.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    // Statistics interval in seconds. Starts a monitoring thread next to the
    // running scheduler.
    #[clap(long)]
    stats: Option<f64>,

    // Monitoring interval in seconds. Runs only the monitoring thread and
    // exits without starting the scheduler; takes precedence over `stats`.
    #[clap(long)]
    monitor: Option<f64>,

    // Runs only the scheduling-sample monitor with this sample count and
    // exits without starting the scheduler.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    // Log filter directive used when no filter can be built from the
    // environment. A value containing "trace" or "debug" also selects the BPF
    // debug level in `Scheduler::init`.
    #[clap(long, default_value = "info")]
    log_level: String,

    // Print the version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    // Optional run identifier; only logged at startup.
    #[clap(long)]
    run_id: Option<u64>,

    // Print the statistics metadata description and exit.
    #[clap(long)]
    help_stats: bool,

    // libbpf options, converted into BPF open options in `Scheduler::init`.
    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,

    // Optional topology arguments handed to `CpuOrder::new`.
    #[clap(flatten)]
    topology: Option<TopologyArgs>,
}
287
288impl Opts {
289 fn can_autopilot(&self) -> bool {
290 self.autopower == false
291 && self.performance == false
292 && self.powersave == false
293 && self.balanced == false
294 && self.no_core_compaction == false
295 }
296
297 fn can_autopower(&self) -> bool {
298 self.autopilot == false
299 && self.performance == false
300 && self.powersave == false
301 && self.balanced == false
302 && self.no_core_compaction == false
303 }
304
305 fn can_performance(&self) -> bool {
306 self.autopilot == false
307 && self.autopower == false
308 && self.powersave == false
309 && self.balanced == false
310 }
311
312 fn can_balanced(&self) -> bool {
313 self.autopilot == false
314 && self.autopower == false
315 && self.performance == false
316 && self.powersave == false
317 && self.no_core_compaction == false
318 }
319
320 fn can_powersave(&self) -> bool {
321 self.autopilot == false
322 && self.autopower == false
323 && self.performance == false
324 && self.balanced == false
325 && self.no_core_compaction == false
326 }
327
328 fn proc(&mut self) -> Option<&mut Self> {
329 if !self.autopilot {
330 self.autopilot = self.can_autopilot();
331 }
332
333 if self.autopilot {
334 if !self.can_autopilot() {
335 info!("Autopilot mode cannot be used with conflicting options.");
336 return None;
337 }
338 info!("Autopilot mode is enabled.");
339 }
340
341 if self.autopower {
342 if !self.can_autopower() {
343 info!("Autopower mode cannot be used with conflicting options.");
344 return None;
345 }
346 info!("Autopower mode is enabled.");
347 }
348
349 if self.performance {
350 if !self.can_performance() {
351 info!("Performance mode cannot be used with conflicting options.");
352 return None;
353 }
354 info!("Performance mode is enabled.");
355 self.no_core_compaction = true;
356 }
357
358 if self.powersave {
359 if !self.can_powersave() {
360 info!("Powersave mode cannot be used with conflicting options.");
361 return None;
362 }
363 info!("Powersave mode is enabled.");
364 self.no_core_compaction = false;
365 }
366
367 if self.balanced {
368 if !self.can_balanced() {
369 info!("Balanced mode cannot be used with conflicting options.");
370 return None;
371 }
372 info!("Balanced mode is enabled.");
373 self.no_core_compaction = false;
374 }
375
376 if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
377 self.no_use_em = true;
378 }
379 if self.no_use_em {
380 info!("Energy model won't be used for CPU preference order.");
381 }
382
383 if let Some(pinned_slice) = self.pinned_slice_us {
384 if pinned_slice == 0 {
385 info!("Pinned task slice mode is disabled. Pinned tasks will use per-domain DSQs.");
386 } else if pinned_slice < self.slice_min_us || pinned_slice > self.slice_max_us {
387 info!(
388 "pinned-slice-us ({}) must be between slice-min-us ({}) and slice-max-us ({})",
389 pinned_slice, self.slice_min_us, self.slice_max_us
390 );
391 return None;
392 } else {
393 info!(
394 "Pinned task slice mode is enabled ({} us). Pinned tasks will use per-CPU DSQs.",
395 pinned_slice
396 );
397 }
398 }
399
400 Some(self)
401 }
402
403 fn preempt_shift_range(s: &str) -> Result<u8, String> {
404 number_range(s, 0, 10)
405 }
406
407 fn lat_load_target_pct_range(s: &str) -> Result<u16, String> {
408 number_range(s, 0, 200)
409 }
410
411 fn mig_delta_pct_range(s: &str) -> Result<u8, String> {
412 number_range(s, 0, 100)
413 }
414
415 fn lb_low_util_pct_range(s: &str) -> Result<u8, String> {
416 number_range(s, 0, 100)
417 }
418
419 fn lb_local_dsq_util_pct_range(s: &str) -> Result<u8, String> {
420 number_range(s, 0, 100)
421 }
422}
423
// SAFETY (review note): implementing `Plain` asserts that `msg_task_ctx` may be
// reinterpreted from raw bytes. The type is declared outside this file --
// presumably a generated C-layout struct shared with the BPF side; TODO confirm
// that every bit pattern is a valid value for all of its fields.
unsafe impl Plain for msg_task_ctx {}
425
426impl msg_task_ctx {
427 fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
428 plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
429 }
430}
431
432impl introspec {
433 fn new() -> Self {
434 let intrspc = unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() };
435 intrspc
436 }
437}
438
/// Userspace side of the scheduler: owns the loaded BPF skeleton, the
/// struct_ops attachment and the channels used to serve statistics requests.
struct Scheduler<'a> {
    /// Loaded BPF skeleton.
    skel: BpfSkel<'a>,
    /// Link of the attached `lavd_ops`; taken (and thereby dropped) at the end
    /// of `run` or, at the latest, in `Drop`.
    struct_ops: Option<libbpf_rs::Link>,
    /// Ring buffer manager built over the `introspec_msg` map.
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    /// Userspace copy of the introspection command and argument; copied into
    /// the BPF bss data by `prep_introspec`.
    intrspc: introspec,
    /// Receiving end of the channel that `relay_introspec` feeds with samples.
    intrspc_rx: Receiver<SchedSample>,
    /// Thread id registered by the most recent `StatsReq::NewSampler` request;
    /// requests carrying a different id are answered with `StatsRes::Bye`.
    monitor_tid: Option<ThreadId>,
    /// Statistics server whose request/response channels are used in `run`.
    stats_server: StatsServer<StatsReq, StatsRes>,
    /// Sequence number attached to `SysStats` responses.
    mseq_id: u64,
}
449
450impl<'a> Scheduler<'a> {
451 fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
452 if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
453 panic!(
454 "Num possible CPU IDs ({}) exceeds maximum of ({})",
455 *NR_CPU_IDS, LAVD_CPU_ID_MAX
456 );
457 }
458
459 try_set_rlimit_infinity();
460
461 let debug_level = if opts.log_level.contains("trace") {
463 2
464 } else if opts.log_level.contains("debug") {
465 1
466 } else {
467 0
468 };
469 let mut skel_builder = BpfSkelBuilder::default();
470 skel_builder.obj_builder.debug(debug_level > 1);
471 init_libbpf_logging(Some(PrintLevel::Debug));
472
473 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
474 let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops, open_opts)?;
475
476 if !opts.no_futex_boost {
479 if Self::attach_futex_ftraces(&mut skel)? == false {
480 info!("Fail to attach futex ftraces. Try with tracepoints.");
481 if Self::attach_futex_tracepoints(&mut skel)? == false {
482 info!("Fail to attach futex tracepoints.");
483 }
484 }
485 }
486
487 let order = CpuOrder::new(opts.topology.as_ref(), opts.no_use_em).unwrap();
489 Self::init_cpus(&mut skel, &order);
490 Self::init_cpdoms(&mut skel, &order);
491
492 if order.cpdom_map.len() > 1 {
495 Self::attach_execve_tracepoints(&mut skel)?;
496 }
497
498 Self::init_globals(&mut skel, &opts, &order, debug_level);
500
501 let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
503 let task_size = std::mem::size_of::<types::task_ctx>();
504 let arenalib = ArenaLib::init(skel.object_mut(), task_size, *NR_CPU_IDS)?;
505 arenalib.setup()?;
506
507 let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
509 let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;
510
511 let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
513 let rb_map = &mut skel.maps.introspec_msg;
514 let mut builder = libbpf_rs::RingBufferBuilder::new();
515 builder
516 .add(rb_map, move |data| {
517 Scheduler::relay_introspec(data, &intrspc_tx)
518 })
519 .unwrap();
520 let rb_mgr = builder.build().unwrap();
521
522 Ok(Self {
523 skel,
524 struct_ops,
525 rb_mgr,
526 intrspc: introspec::new(),
527 intrspc_rx,
528 monitor_tid: None,
529 stats_server,
530 mseq_id: 0,
531 })
532 }
533
534 fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
535 let ftraces = vec![
536 ("__futex_wait", &skel.progs.fexit___futex_wait),
537 ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
538 (
539 "futex_wait_requeue_pi",
540 &skel.progs.fexit_futex_wait_requeue_pi,
541 ),
542 ("futex_wake", &skel.progs.fexit_futex_wake),
543 ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
544 ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
545 ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
546 ];
547
548 if compat::tracer_available("function")? == false {
549 info!("Ftrace is not enabled in the kernel.");
550 return Ok(false);
551 }
552
553 compat::cond_kprobes_enable(ftraces)
554 }
555
556 fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
557 let tracepoints = vec![
558 ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
559 ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
560 (
561 "syscalls:sys_exit_futex_wait",
562 &skel.progs.rtp_sys_exit_futex_wait,
563 ),
564 (
565 "syscalls:sys_exit_futex_waitv",
566 &skel.progs.rtp_sys_exit_futex_waitv,
567 ),
568 (
569 "syscalls:sys_exit_futex_wake",
570 &skel.progs.rtp_sys_exit_futex_wake,
571 ),
572 ];
573
574 compat::cond_tracepoints_enable(tracepoints)
575 }
576
577 fn attach_execve_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
578 let tracepoints = vec![
579 (
580 "syscalls:sys_enter_execve",
581 &skel.progs.cond_hook_sys_enter_execve,
582 ),
583 (
584 "syscalls:sys_enter_execveat",
585 &skel.progs.cond_hook_sys_enter_execveat,
586 ),
587 ];
588
589 compat::cond_tracepoints_enable(tracepoints)
590 }
591
592 fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
593 debug!("{:#?}", order);
594
595 for cpu in order.cpuids.iter() {
597 skel.maps.rodata_data.as_mut().unwrap().cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
598 skel.maps.rodata_data.as_mut().unwrap().cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
599 skel.maps.rodata_data.as_mut().unwrap().cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
600 skel.maps.rodata_data.as_mut().unwrap().cpu_sibling[cpu.cpu_adx] =
601 cpu.cpu_sibling as u32;
602 }
603
604 let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
606 if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
607 panic!("Generated performance vs. CPU order stats are too complex ({nr_pco_states}) to handle");
608 }
609
610 skel.maps.rodata_data.as_mut().unwrap().nr_pco_states = nr_pco_states;
611 for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
612 Self::init_pco_tuple(skel, i, &pco);
613 info!("{:#}", pco);
614 }
615
616 let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
617 for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
618 Self::init_pco_tuple(skel, i as usize, &last_pco);
619 }
620 }
621
622 fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
623 let cpus_perf = pco.cpus_perf.borrow();
624 let cpus_ovflw = pco.cpus_ovflw.borrow();
625 let pco_nr_primary = cpus_perf.len();
626
627 skel.maps.rodata_data.as_mut().unwrap().pco_bounds[i] = pco.perf_cap as u32;
628 skel.maps.rodata_data.as_mut().unwrap().pco_nr_primary[i] = pco_nr_primary as u16;
629
630 for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
631 skel.maps.rodata_data.as_mut().unwrap().pco_table[i][j] = cpu_adx as u16;
632 }
633
634 for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
635 let k = j + pco_nr_primary;
636 skel.maps.rodata_data.as_mut().unwrap().pco_table[i][k] = cpu_adx as u16;
637 }
638 }
639
640 fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
641 for (k, v) in order.cpdom_map.iter() {
643 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
644 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].alt_id =
645 v.cpdom_alt_id.get() as u64;
646 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].numa_id = k.numa_adx as u8;
647 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].llc_id = k.llc_adx as u8;
648 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
649 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].is_valid = 1;
650 for cpu_id in v.cpu_ids.iter() {
651 let i = cpu_id / 64;
652 let j = cpu_id % 64;
653 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].__cpumask[i] |=
654 0x01 << j;
655 }
656
657 if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
658 panic!("The processor topology is too complex to handle in BPF.");
659 }
660
661 for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
662 let nr_neighbors = neighbors.borrow().len() as u8;
663 if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
664 panic!("The processor topology is too complex to handle in BPF.");
665 }
666 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].nr_neighbors[k] =
667 nr_neighbors;
668 for (i, &id) in neighbors.borrow().iter().enumerate() {
669 let idx = (k * LAVD_CPDOM_MAX_NR as usize) + i;
670 skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].neighbor_ids[idx] =
671 id as u8;
672 }
673 }
674 }
675 }
676
677 fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder, debug_level: u8) {
678 let bss_data = skel.maps.bss_data.as_mut().unwrap();
679 bss_data.no_preemption = opts.no_preemption;
680 bss_data.no_core_compaction = opts.no_core_compaction;
681 bss_data.no_freq_scaling = opts.no_freq_scaling;
682 bss_data.is_powersave_mode = opts.powersave;
683 let rodata = skel.maps.rodata_data.as_mut().unwrap();
684 rodata.nr_llcs = order.nr_llcs as u64;
685 rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
686 rodata.is_smt_active = order.smt_enabled;
687 rodata.is_autopilot_on = opts.autopilot;
688 rodata.verbose = debug_level;
689 rodata.slice_max_ns = opts.slice_max_us * 1000;
690 rodata.slice_min_ns = opts.slice_min_us * 1000;
691 rodata.pinned_slice_ns = opts.pinned_slice_us.map(|v| v * 1000).unwrap_or(0);
692 rodata.preempt_shift = opts.preempt_shift;
693 rodata.lat_load_target_pct = opts.lat_load_target_pct;
694 rodata.mig_delta_pct = opts.mig_delta_pct;
695 rodata.lb_low_util_wall = ((opts.lb_low_util_pct as u64) << 10) / 100;
696 rodata.lb_local_dsq_util_wall = ((opts.lb_local_dsq_util_pct as u64) << 10) / 100;
697 rodata.no_use_em = opts.no_use_em as u8;
698 rodata.no_fast_lb = opts.no_fast_lb as u8;
699 rodata.no_wake_sync = opts.no_wake_sync;
700 rodata.no_slice_boost = opts.no_slice_boost;
701 rodata.per_cpu_dsq = opts.per_cpu_dsq;
702 rodata.enable_cpu_bw = opts.enable_cpu_bw;
703
704 if !ksym_exists("scx_group_set_bandwidth").unwrap() {
705 skel.struct_ops.lavd_ops_mut().cgroup_set_bandwidth = std::ptr::null_mut();
706 warn!("Kernel does not support ops.cgroup_set_bandwidth(), so disable it.");
707 }
708
709 skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
710 | *compat::SCX_OPS_ENQ_LAST
711 | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
712 | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
713
714 if opts.partial {
715 skel.struct_ops.lavd_ops_mut().flags |= *compat::SCX_OPS_SWITCH_PARTIAL;
716 }
717 }
718
719 fn get_msg_seq_id() -> u64 {
720 static mut MSEQ: u64 = 0;
721 unsafe {
722 MSEQ += 1;
723 MSEQ
724 }
725 }
726
727 fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
728 let mt = msg_task_ctx::from_bytes(data);
729 let tx = mt.taskc_x;
730
731 if mt.hdr.kind != LAVD_MSG_TASKC {
733 return 0;
734 }
735
736 let mseq = Scheduler::get_msg_seq_id();
737
738 let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
739 let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
740 let tx_comm: &str = c_tx_cm_str.to_str().unwrap();
741
742 let c_waker_cm: *const c_char = (&tx.waker_comm as *const [c_char; 17]) as *const c_char;
743 let c_waker_cm_str: &CStr = unsafe { CStr::from_ptr(c_waker_cm) };
744 let waker_comm: &str = c_waker_cm_str.to_str().unwrap();
745
746 let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
747 let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
748 let tx_stat: &str = c_tx_st_str.to_str().unwrap();
749
750 match intrspc_tx.try_send(SchedSample {
751 mseq,
752 pid: tx.pid,
753 comm: tx_comm.into(),
754 stat: tx_stat.into(),
755 cpu_id: tx.cpu_id,
756 prev_cpu_id: tx.prev_cpu_id,
757 suggested_cpu_id: tx.suggested_cpu_id,
758 waker_pid: tx.waker_pid,
759 waker_comm: waker_comm.into(),
760 slice_wall: tx.slice_wall,
761 lat_cri: tx.lat_cri,
762 avg_lat_cri: tx.avg_lat_cri,
763 static_prio: tx.static_prio,
764 rerunnable_interval_wall: tx.rerunnable_interval_wall,
765 resched_interval_wall: tx.resched_interval_wall,
766 run_freq: tx.run_freq,
767 avg_runtime_wall: tx.avg_runtime_wall,
768 wait_freq: tx.wait_freq,
769 wake_freq: tx.wake_freq,
770 perf_cri: tx.perf_cri,
771 thr_perf_cri: tx.thr_perf_cri,
772 cpuperf_cur: tx.cpuperf_cur,
773 cpu_util_wall: tx.cpu_util_wall,
774 cpu_util_invr: tx.cpu_util_invr,
775 steal_util_wall: tx.steal_util_wall,
776 steal_util_invr: tx.steal_util_invr,
777 dom_pinned_util_wall: tx.dom_pinned_util_wall,
778 dom_pinned_util_invr: tx.dom_pinned_util_invr,
779 nr_active: tx.nr_active,
780 dsq_id: tx.dsq_id,
781 dsq_consume_lat: tx.dsq_consume_lat,
782 lat_headroom: tx.lat_headroom,
783 vuln_thresh: tx.vuln_thresh,
784 task_util_est: tx.task_util_est,
785 norm_lat_cri: tx.norm_lat_cri,
786 slice_used_wall: tx.last_slice_used_wall,
787 }) {
788 Ok(()) | Err(TrySendError::Full(_)) => 0,
789 Err(e) => panic!("failed to send on intrspc_tx ({})", e),
790 }
791 }
792
793 fn prep_introspec(&mut self) {
794 if !self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
795 self.skel.maps.bss_data.as_mut().unwrap().is_monitored = true;
796 }
797 self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = self.intrspc.cmd;
798 self.skel.maps.bss_data.as_mut().unwrap().intrspc.arg = self.intrspc.arg;
799 }
800
801 fn cleanup_introspec(&mut self) {
802 self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = LAVD_CMD_NOP;
803 }
804
805 fn get_pc(x: u64, y: u64) -> f64 {
806 return 100. * x as f64 / y as f64;
807 }
808
809 fn get_power_mode(power_mode: i32) -> &'static str {
810 match power_mode as u32 {
811 LAVD_PM_PERFORMANCE => "performance",
812 LAVD_PM_BALANCED => "balanced",
813 LAVD_PM_POWERSAVE => "powersave",
814 _ => "unknown",
815 }
816 }
817
818 fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
819 Ok(match req {
820 StatsReq::NewSampler(tid) => {
821 self.rb_mgr.consume().unwrap();
822 self.monitor_tid = Some(*tid);
823 StatsRes::Ack
824 }
825 StatsReq::SysStatsReq { tid } => {
826 if Some(*tid) != self.monitor_tid {
827 return Ok(StatsRes::Bye);
828 }
829 self.mseq_id += 1;
830
831 let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
832 let st = bss_data.sys_stat;
833
834 let mseq = self.mseq_id;
835 let nr_queued_task = st.nr_queued_task;
836 let nr_active = st.nr_active;
837 let nr_sched = st.nr_sched;
838 let nr_preempt = st.nr_preempt;
839 let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
840 let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
841 let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
842 let nr_stealee = st.nr_stealee;
843 let nr_big = st.nr_big;
844 let pc_big = Self::get_pc(nr_big, nr_sched);
845 let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
846 let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
847 let power_mode = Self::get_power_mode(bss_data.power_mode);
848 let total_time = bss_data.performance_mode_ns
849 + bss_data.balanced_mode_ns
850 + bss_data.powersave_mode_ns;
851 let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
852 let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
853 let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);
854
855 StatsRes::SysStats(SysStats {
856 mseq,
857 nr_queued_task,
858 nr_active,
859 nr_sched,
860 nr_preempt,
861 pc_pc,
862 pc_lc,
863 pc_x_migration,
864 nr_stealee,
865 pc_big,
866 pc_pc_on_big,
867 pc_lc_on_big,
868 power_mode: power_mode.to_string(),
869 pc_performance,
870 pc_balanced,
871 pc_powersave,
872 })
873 }
874 StatsReq::SchedSamplesNr {
875 tid,
876 nr_samples,
877 interval_ms,
878 } => {
879 if Some(*tid) != self.monitor_tid {
880 return Ok(StatsRes::Bye);
881 }
882
883 self.intrspc.cmd = LAVD_CMD_SCHED_N;
884 self.intrspc.arg = *nr_samples;
885 self.prep_introspec();
886 std::thread::sleep(Duration::from_millis(*interval_ms));
887 self.rb_mgr.poll(Duration::from_millis(100)).unwrap();
888
889 let mut samples = vec![];
890 while let Ok(ts) = self.intrspc_rx.try_recv() {
891 samples.push(ts);
892 }
893
894 self.cleanup_introspec();
895
896 StatsRes::SchedSamples(SchedSamples { samples })
897 }
898 })
899 }
900
901 fn stop_monitoring(&mut self) {
902 if self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
903 self.skel.maps.bss_data.as_mut().unwrap().is_monitored = false;
904 }
905 }
906
    /// Returns the result of the `uei_exited!` macro applied to the skeleton's
    /// `uei` field. The macro comes from `scx_utils`; presumably it reports
    /// whether the BPF scheduler has recorded an exit -- confirm against the
    /// macro's documentation. The run loop stops once this returns `true`.
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
910
911 fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
912 let prog = &mut self.skel.progs.set_power_profile;
913 let mut args = power_arg {
914 power_mode: mode as c_int,
915 };
916 let input = ProgramInput {
917 context_in: Some(unsafe {
918 std::slice::from_raw_parts_mut(
919 &mut args as *mut _ as *mut u8,
920 std::mem::size_of_val(&args),
921 )
922 }),
923 ..Default::default()
924 };
925 let out = prog.test_run(input).unwrap();
926 if out.return_value != 0 {
927 return Err(out.return_value);
928 }
929
930 Ok(())
931 }
932
933 fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
934 let profile = fetch_power_profile(false);
935 if profile == prev_profile {
936 return (true, profile);
938 }
939
940 let _ = match profile {
941 PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
942 PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
943 PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
944 PowerProfile::Unknown => {
945 return (false, profile);
948 }
949 };
950
951 info!("Set the scheduler's power profile to {profile} mode.");
952 (true, profile)
953 }
954
955 fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
956 let (res_ch, req_ch) = self.stats_server.channels();
957 let mut autopower = opts.autopower;
958 let mut profile = PowerProfile::Unknown;
959
960 if opts.performance {
961 let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
962 } else if opts.powersave {
963 let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
964 } else {
965 let _ = self.set_power_profile(LAVD_PM_BALANCED);
966 }
967
968 while !shutdown.load(Ordering::Relaxed) && !self.exited() {
969 if autopower {
970 (autopower, profile) = self.update_power_profile(profile);
971 }
972
973 match req_ch.recv_timeout(Duration::from_secs(1)) {
974 Ok(req) => {
975 let res = self.stats_req_to_res(&req)?;
976 res_ch.send(res)?;
977 }
978 Err(RecvTimeoutError::Timeout) => {
979 self.stop_monitoring();
980 }
981 Err(e) => {
982 self.stop_monitoring();
983 Err(e)?
984 }
985 }
986 self.cleanup_introspec();
987 }
988 self.rb_mgr.consume().unwrap();
989
990 bpf_streams::dump_bpf_streams(&mut self.skel);
991 let _ = self.struct_ops.take();
992 uei_report!(&self.skel, uei)
993 }
994}
995
996impl Drop for Scheduler<'_> {
997 fn drop(&mut self) {
998 info!("Unregister {SCHEDULER_NAME} scheduler");
999
1000 if let Some(struct_ops) = self.struct_ops.take() {
1001 drop(struct_ops);
1002 }
1003 }
1004}
1005
1006fn init_log(opts: &Opts) {
1007 let env_filter = EnvFilter::try_from_default_env()
1008 .or_else(|_| match EnvFilter::try_new(&opts.log_level) {
1009 Ok(filter) => Ok(filter),
1010 Err(e) => {
1011 eprintln!(
1012 "invalid log envvar: {}, using info, err is: {}",
1013 opts.log_level, e
1014 );
1015 EnvFilter::try_new("info")
1016 }
1017 })
1018 .unwrap_or_else(|_| EnvFilter::new("info"));
1019
1020 match tracing_subscriber::fmt()
1021 .with_env_filter(env_filter)
1022 .with_target(true)
1023 .with_thread_ids(true)
1024 .with_file(true)
1025 .with_line_number(true)
1026 .try_init()
1027 {
1028 Ok(()) => {}
1029 Err(e) => eprintln!("failed to init logger: {}", e),
1030 }
1031}
1032
1033#[clap_main::clap_main]
1034fn main(mut opts: Opts) -> Result<()> {
1035 if opts.version {
1036 println!(
1037 "scx_lavd {}",
1038 build_id::full_version(env!("CARGO_PKG_VERSION"))
1039 );
1040 return Ok(());
1041 }
1042
1043 if opts.help_stats {
1044 let sys_stats_meta_name = SysStats::meta().name;
1045 let sched_sample_meta_name = SchedSample::meta().name;
1046 let stats_meta_names: &[&str] = &[
1047 sys_stats_meta_name.as_str(),
1048 sched_sample_meta_name.as_str(),
1049 ];
1050 stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
1051 return Ok(());
1052 }
1053
1054 init_log(&opts);
1055
1056 if opts.verbose > 0 {
1057 warn!("Setting verbose via -v is deprecated and will be an error in future releases.");
1058 }
1059
1060 if let Some(run_id) = opts.run_id {
1061 info!("scx_lavd run_id: {}", run_id);
1062 }
1063
1064 if opts.monitor.is_none() && opts.monitor_sched_samples.is_none() {
1065 opts.proc().unwrap();
1066 info!("{:#?}", opts);
1067 }
1068
1069 let shutdown = Arc::new(AtomicBool::new(false));
1070 let shutdown_clone = shutdown.clone();
1071 ctrlc::set_handler(move || {
1072 shutdown_clone.store(true, Ordering::Relaxed);
1073 })
1074 .context("Error setting Ctrl-C handler")?;
1075
1076 if let Some(nr_samples) = opts.monitor_sched_samples {
1077 let shutdown_copy = shutdown.clone();
1078 let jh = std::thread::spawn(move || {
1079 stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
1080 });
1081 let _ = jh.join();
1082 return Ok(());
1083 }
1084
1085 if let Some(intv) = opts.monitor.or(opts.stats) {
1086 let shutdown_copy = shutdown.clone();
1087 let jh = std::thread::spawn(move || {
1088 stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
1089 });
1090 if opts.monitor.is_some() {
1091 let _ = jh.join();
1092 return Ok(());
1093 }
1094 }
1095
1096 let mut open_object = MaybeUninit::uninit();
1097 loop {
1098 let mut sched = Scheduler::init(&opts, &mut open_object)?;
1099 info!(
1100 "scx_lavd scheduler is initialized (build ID: {})",
1101 build_id::full_version(env!("CARGO_PKG_VERSION"))
1102 );
1103 info!("scx_lavd scheduler starts running.");
1104 if !sched.run(&opts, shutdown.clone())?.should_restart() {
1105 break;
1106 }
1107 }
1108
1109 Ok(())
1110}