mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::mem::MaybeUninit;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::warn;
use log::{debug, info};
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::try_set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_bpfland";

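/// CPU classes used to define the primary scheduling domain: Performance
/// selects only "big" cores, Powersave only "little" cores, and Any selects
/// every CPU.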
#[derive(PartialEq)]
enum Powermode {
    Performance,
    Powersave,
    Any,
}

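/// Return the list of CPU IDs that match the requested power mode, based on
/// the core types reported by the system topology.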
fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            (Powermode::Performance, CoreType::Big { .. })
            | (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}

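/// Convert a list of CPU IDs into a hex bitmask string that can be parsed by
/// Cpumask::from_str(), or "none" when the list is empty.
///
/// For example, CPUs 0, 1 and 8 set bits 0, 1 and 8 of the mask:
///
/// ```ignore
/// assert_eq!(cpus_to_cpumask(&[0, 1, 8]), "0x0103");
/// ```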
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Size the bitmask to hold the highest CPU ID, rounding up to whole bytes.
    let max_cpu_id = *cpus.iter().max().unwrap();
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Print the bytes most-significant first, so the string reads as a single
    // big-endian hex number.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}

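/// Command-line options for the scheduler. Most values are forwarded to the
/// BPF side when the scheduler is loaded.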
#[derive(Debug, Parser)]
struct Opts {
    /// Exit debug dump buffer length (0 = default).
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    /// Maximum time slice lag in microseconds (can be negative).
    #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
    slice_us_lag: i64,

    /// CPU throttling period in microseconds (0 = throttling disabled).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// CPU idle QoS resume latency in microseconds (negative = disabled).
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    /// Disable preemption.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    no_preempt: bool,

    /// Prioritize per-CPU tasks.
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Prioritize kernel threads (implicitly enabled when throttling is on).
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Primary CPU domain: "auto", "performance", "powersave", "all", or an
    /// explicit hex cpumask (e.g. "0xff").
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA optimizations.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// Maximum threshold of voluntary context switches (hidden option).
    #[clap(short = 'c', long, default_value = "10", hide = true)]
    nvcsw_max_thresh: u64,

    /// Enable stats monitoring, reporting at the given interval in seconds.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode at the given interval in seconds; the
    /// scheduler itself is not loaded.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics and exit.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        try_set_rlimit_infinity();

        assert!(opts.slice_us >= opts.slice_us_min);

        let topo = Topology::new().unwrap();

        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        // Count only NUMA nodes that actually contain CPUs.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

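        // BPF scheduler setup: values stored in the rodata section must be
        // initialized between open and load, since they become read-only
        // constants for the BPF program once it is loaded.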
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops, open_opts)?;

        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;

        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.debug = opts.debug;
        rodata.smt_enabled = smt_enabled;
        rodata.numa_disabled = numa_disabled;
        rodata.local_pcpu = opts.local_pcpu;
        rodata.no_preempt = opts.no_preempt;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.slice_max = opts.slice_us * 1000;
        rodata.slice_min = opts.slice_us_min * 1000;
        rodata.slice_lag = opts.slice_us_lag * 1000;
        rodata.throttle_ns = opts.throttle_us * 1000;

        // Always prioritize kthreads when CPU throttling is enabled.
        rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        rodata.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.bpfland_ops_mut().flags
        );

        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;

        let power_profile = Self::power_profile();
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

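    /// Add a CPU to the primary scheduling domain, or reset the domain when
    /// cpu is negative. The request is handed to the BPF side by running the
    /// enable_primary_cpu program via test_run(), with the argument struct
    /// passed as raw bytes through context_in.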
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

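    /// Build the cpumask for the given power mode, falling back to all CPUs
    /// when no CPU matches the requested mode (e.g. on systems without a
    /// big/little core distinction).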
    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

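    /// Resolve the primary CPU domain from the --primary-domain option:
    /// "powersave" and "performance" select little and big cores
    /// respectively, "auto" follows the current power profile, "all" uses
    /// every CPU, and any other value is parsed as an explicit cpumask.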
    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false } => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Performance => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            _ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        // Reset the previous domain, then enable the requested CPUs one by one.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

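    /// Set the cpufreq performance level consumed by the BPF side:
    /// 0 = minimum frequency, 1024 = maximum frequency, negative = pick the
    /// level automatically.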
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        auto: bool,
    ) -> Result<()> {
        let perf_lvl: i64 = match primary_domain {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

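    /// Re-check the system power profile. Returns true when the primary
    /// domain is "auto" and the profile changed, meaning the scheduler must
    /// be restarted to rebuild its CPU domains; otherwise only the cpufreq
    /// performance level is refreshed.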
    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

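    /// Register sibling_cpu as a sibling of cpu at the given topology level
    /// (0 = SMT, 2 = L2 cache, 3 = L3 cache) by running the
    /// enable_sibling_cpu BPF program.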
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

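    /// Return true if all the CPUs in the slice are SMT siblings of each
    /// other; empty and single-CPU slices trivially qualify.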
    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        if cpus.len() <= 1 {
            return true;
        }

        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }

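    /// Group CPUs by their L2 or L3 cache ID and register every pair of CPUs
    /// sharing a cache as siblings at that level. Groups with a single CPU,
    /// or whose CPUs are already SMT siblings, are skipped.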
    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Map each cache ID to the list of CPUs sharing that cache.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        for (cache_id, cpus) in cache_id_map {
            if cpus.len() <= 1 {
                continue;
            }

            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

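    /// Snapshot the counters exported by the BPF side into a Metrics struct
    /// for the stats server.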
    fn get_metrics(&self) -> Metrics {
        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
        Metrics {
            nr_running: bss_data.nr_running,
            nr_cpus: bss_data.nr_online_cpus,
            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: bss_data.nr_direct_dispatches,
            nr_shared_dispatches: bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");

        // Restore the original CPU idle QoS resume latencies.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    // Spawn the stats monitor thread; in --monitor mode this is all we do.
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => debug!("stats monitor thread finished successfully"),
                Err(err) => warn!("stats monitor thread finished because of an error {}", err),
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

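    // Restart the scheduler when the kernel requests it (should_restart) or
    // when a power profile change requires re-initializing the CPU domains
    // (user_restart); exit the loop otherwise.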
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}