1mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::collections::BTreeMap;
15use std::ffi::c_int;
16use std::fmt::Write;
17use std::mem::MaybeUninit;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::time::Duration;
22
23use anyhow::Context;
24use anyhow::Result;
25use clap::Parser;
26use crossbeam::channel::RecvTimeoutError;
27use libbpf_rs::OpenObject;
28use libbpf_rs::ProgramInput;
29use log::warn;
30use log::{debug, info};
31use scx_stats::prelude::*;
32use scx_utils::autopower::{fetch_power_profile, PowerProfile};
33use scx_utils::build_id;
34use scx_utils::compat;
35use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
36use scx_utils::scx_ops_attach;
37use scx_utils::scx_ops_load;
38use scx_utils::scx_ops_open;
39use scx_utils::set_rlimit_infinity;
40use scx_utils::uei_exited;
41use scx_utils::uei_report;
42use scx_utils::CoreType;
43use scx_utils::Cpumask;
44use scx_utils::Topology;
45use scx_utils::UserExitInfo;
46use scx_utils::NR_CPU_IDS;
47use stats::Metrics;
48
// Scheduler name used for logging and version reporting.
const SCHEDULER_NAME: &str = "scx_bpfland";

// Power mode used to select which CPUs belong to the primary domain.
#[derive(PartialEq)]
enum Powermode {
    Performance, // only "big" (performance) cores
    Powersave,   // only "little" (efficiency) cores
    Any,         // any core
}
57
58fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
59 let topo = Topology::new().unwrap();
60
61 let cpus: Vec<usize> = topo
62 .all_cores
63 .values()
64 .flat_map(|core| &core.cpus)
65 .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
66 (Powermode::Performance, CoreType::Big { .. }) |
68 (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
70 (Powermode::Any, ..) => Some(*cpu_id),
71 _ => None,
72 })
73 .collect();
74
75 Ok(cpus)
76}
77
/// Convert a list of CPU ids into a hex cpumask string (e.g. "0x8001"),
/// least-significant bit = CPU 0, suitable for `Cpumask::from_str`.
///
/// Returns "none" when the list is empty.
///
/// Takes `&[usize]` instead of `&Vec<usize>` (clippy::ptr_arg); existing
/// callers passing `&Vec<usize>` still compile via deref coercion.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    // No CPUs: return an empty cpumask.
    if cpus.is_empty() {
        return String::from("none");
    }

    // Highest CPU id determines how many bytes the bitmask needs.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // One bit per CPU, rounded up to whole bytes.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the bit corresponding to each CPU id.
    for cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // Emit the most-significant byte first so the string reads as one
    // big-endian hex number.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut out, byte| {
        let _ = write!(&mut out, "{:02x}", byte);
        out
    });

    format!("0x{}", hex_str)
}
105
// Command-line options.
//
// NOTE(review): plain `//` comments are used on purpose — `///` doc comments
// on clap-derive fields become --help text and would change runtime output.
#[derive(Debug, Parser)]
struct Opts {
    // Size of the BPF exit dump buffer (0 = kernel default).
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    // Maximum time slice in microseconds (stored as slice_max, in ns).
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    // Minimum time slice in microseconds; must be <= slice_us.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    // Virtual-time lag in microseconds; may be negative.
    #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
    slice_us_lag: i64,

    // CPU throttling interval in microseconds (0 = disabled; implies
    // local_kthreads when > 0).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    // Idle-state resume latency QoS in microseconds (< 0 = leave untouched).
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    // Disable preemption in the BPF scheduler.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    no_preempt: bool,

    // Dispatch per-CPU tasks to their local queue.
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    // Dispatch kthreads locally.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    // Disable synchronous-wakeup optimization.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    // Primary scheduling domain: "auto", "performance", "powersave", "all",
    // or an explicit cpumask string (e.g. "0xff").
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    // Disable L2-cache domain awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    // Disable L3-cache domain awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    // Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    // Disable NUMA optimizations.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    // Enable automatic cpufreq performance-level control.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    // Hidden tuning knob; not referenced by the visible user-space code —
    // presumably consumed by the BPF side. TODO(review): confirm semantics.
    #[clap(short = 'c', long, default_value = "10", hide = true)]
    nvcsw_max_thresh: u64,

    // Enable stats reporting at the given interval, in seconds.
    #[clap(long)]
    stats: Option<f64>,

    // Monitor-only mode at the given interval, in seconds (does not load
    // the scheduler).
    #[clap(long)]
    monitor: Option<f64>,

    // Enable BPF-side debugging.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    // Enable verbose output (including libbpf details).
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    // Print version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    // Describe the available stats fields and exit.
    #[clap(long)]
    help_stats: bool,
}
256
// Runtime state of the user-space side of the scheduler.
struct Scheduler<'a> {
    skel: BpfSkel<'a>,                      // loaded BPF skeleton
    struct_ops: Option<libbpf_rs::Link>,    // attached struct_ops link; dropped to detach
    opts: &'a Opts,                         // parsed command-line options
    topo: Topology,                         // system CPU topology snapshot
    power_profile: PowerProfile,            // power profile observed at init time
    stats_server: StatsServer<(), Metrics>, // serves metrics to stats clients
    user_restart: bool,                     // set when main() should re-init the scheduler
}
266
267impl<'a> Scheduler<'a> {
    /// Open, configure, load and attach the BPF scheduler, returning a fully
    /// initialized `Scheduler`.
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        set_rlimit_infinity();

        // The maximum slice must never be smaller than the minimum one.
        assert!(opts.slice_us >= opts.slice_us_min);

        let topo = Topology::new().unwrap();

        // SMT support can be disabled explicitly or be absent in hardware.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        // Count only NUMA nodes that actually contain CPUs.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // With a single populated node the NUMA logic is pure overhead.
        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Optionally cap the idle-state resume latency (QoS) on every CPU.
        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Open the BPF skeleton and propagate the run-time tunables.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops)?;

        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;

        skel.maps.rodata_data.debug = opts.debug;
        skel.maps.rodata_data.smt_enabled = smt_enabled;
        skel.maps.rodata_data.numa_disabled = numa_disabled;
        skel.maps.rodata_data.local_pcpu = opts.local_pcpu;
        skel.maps.rodata_data.no_preempt = opts.no_preempt;
        skel.maps.rodata_data.no_wake_sync = opts.no_wake_sync;
        // Time-based options are taken in microseconds, stored in nanoseconds.
        skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
        skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
        skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
        skel.maps.rodata_data.throttle_ns = opts.throttle_us * 1000;

        // CPU throttling implies local dispatch of kthreads.
        skel.maps.rodata_data.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        skel.maps.rodata_data.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Assemble the scx_ops flags; per-node idle tracking is enabled only
        // when NUMA optimizations are active.
        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.bpfland_ops_mut().flags
        );

        // Load (verify) the BPF program.
        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;

        // Configure the primary CPU domain and cpufreq level; failures here
        // are logged but not fatal.
        let power_profile = Self::power_profile();
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Register the SMT and cache topology with the BPF side.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler and start serving stats.
        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }
399
    /// Invoke the `enable_primary_cpu` BPF program via test_run to add `cpu`
    /// to the primary scheduling domain; a negative `cpu` resets the domain
    /// (see `init_energy_domain`).
    ///
    /// Propagates the BPF program's non-zero return value as the error.
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a live local that outlives the test_run call;
            // the slice merely reinterprets it as the raw byte buffer the BPF
            // program expects as its context.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }
421
422 fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
423 let mut cpus = get_primary_cpus(profile).unwrap_or_default();
424 if cpus.is_empty() {
425 cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
426 }
427 Cpumask::from_str(&cpus_to_cpumask(&cpus))
428 }
429
    /// Resolve `primary_domain` ("powersave", "performance", "auto", "all",
    /// or an explicit cpumask string) into a cpumask and program it as the
    /// primary CPU domain in the BPF scheduler.
    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            // "auto" follows the current system power profile.
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false } => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Performance => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            // Anything else is parsed as an explicit cpumask string.
            &_ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        // Clear the previous primary domain (negative cpu = reset) before
        // re-populating it CPU by CPU.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }
468
469 fn init_cpufreq_perf(
471 skel: &mut BpfSkel<'_>,
472 primary_domain: &String,
473 auto: bool,
474 ) -> Result<()> {
475 let perf_lvl: i64 = match primary_domain.as_str() {
478 "powersave" => 0,
479 _ if auto => -1,
480 _ => 1024,
481 };
482 info!(
483 "cpufreq performance level: {}",
484 match perf_lvl {
485 1024 => "max".into(),
486 0 => "min".into(),
487 n if n < 0 => "auto".into(),
488 _ => perf_lvl.to_string(),
489 }
490 );
491 skel.maps.bss_data.cpufreq_perf_lvl = perf_lvl;
492
493 Ok(())
494 }
495
496 fn power_profile() -> PowerProfile {
497 let profile = fetch_power_profile(true);
498 if profile == PowerProfile::Unknown {
499 fetch_power_profile(false)
500 } else {
501 profile
502 }
503 }
504
    /// Track power-profile changes at runtime.
    ///
    /// Returns true when the scheduler must be restarted so init() can
    /// rebuild the "auto" primary domain; otherwise refreshes the cpufreq
    /// performance level in place and returns false.
    fn refresh_sched_domain(&mut self) -> bool {
        // If the profile was unknown at startup there is nothing to track.
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                // The "auto" domain depends on the power profile: request a
                // full restart to re-evaluate it.
                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }
526
    /// Invoke the `enable_sibling_cpu` BPF program via test_run to register
    /// `sibling_cpu` as a sibling of `cpu` at topology level `lvl`
    /// (0 = SMT, 2 = L2 cache, 3 = L3 cache — see the callers).
    ///
    /// Propagates the BPF program's non-zero return value as the error.
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a live local that outlives the test_run call;
            // the slice merely reinterprets it as the raw byte buffer the BPF
            // program expects as its context.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }
555
    /// Register every CPU's SMT sibling with the BPF side at level 0.
    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            // NOTE(review): a BPF-side failure panics here rather than being
            // propagated; the Err type (u32) doesn't fit io::Error.
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }
566
567 fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
568 if cpus.len() <= 1 {
570 return true;
571 }
572
573 let first_cpu = cpus[0];
575 let smt_siblings = topo.sibling_cpus();
576 cpus.iter().all(|&cpu| {
577 cpu == first_cpu
578 || smt_siblings[cpu] == first_cpu as i32
579 || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
580 })
581 }
582
    /// Group CPUs by their L2 or L3 cache id and register every (cpu,
    /// sibling) pair sharing a cache with the BPF side via
    /// `enable_sibling_cpu_fn`.
    ///
    /// `cache_lvl` must be 2 or 3; any other value panics.
    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Map each cache id to the list of CPUs sharing that cache.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        for (cache_id, cpus) in cache_id_map {
            // A single-CPU domain has no siblings to register.
            if cpus.len() <= 1 {
                continue;
            }

            // Skip domains that are exactly an SMT sibling pair: those are
            // already handled by the SMT domain (level 0).
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            // Register every ordered pair, including cpu == sibling_cpu.
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }
632
633 fn init_l2_cache_domains(
634 skel: &mut BpfSkel<'_>,
635 topo: &Topology,
636 ) -> Result<(), std::io::Error> {
637 Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
638 Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
639 })
640 }
641
642 fn init_l3_cache_domains(
643 skel: &mut BpfSkel<'_>,
644 topo: &Topology,
645 ) -> Result<(), std::io::Error> {
646 Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
647 Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
648 })
649 }
650
651 fn get_metrics(&self) -> Metrics {
652 Metrics {
653 nr_running: self.skel.maps.bss_data.nr_running,
654 nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
655 nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
656 nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
657 nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
658 }
659 }
660
    /// Report whether the BPF scheduler has exited (as signalled through the
    /// user exit info area).
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
664
    /// Main loop: serve stats requests until shutdown is requested, the BPF
    /// side exits, or a power-profile change demands a restart; then detach
    /// the scheduler and report the user exit info.
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                // Ask main() to re-create the scheduler with the new profile.
                self.user_restart = true;
                break;
            }
            // The 1s stats-request timeout doubles as the loop heartbeat.
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        // Dropping the struct_ops link detaches the BPF scheduler before the
        // exit info is collected.
        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
682}
683
684impl Drop for Scheduler<'_> {
685 fn drop(&mut self) {
686 info!("Unregister {} scheduler", SCHEDULER_NAME);
687
688 if self.opts.idle_resume_us >= 0 {
690 if cpu_idle_resume_latency_supported() {
691 for cpu in self.topo.all_cpus.values() {
692 update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
693 .unwrap();
694 }
695 }
696 }
697 }
698}
699
fn main() -> Result<()> {
    let opts = Opts::parse();

    // Handle the one-shot informational flags first.
    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    // Log to stderr at info level; timestamps only on error records.
    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    // Ctrl-C flips the shared shutdown flag polled by the run loop.
    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    // Spawn the stats monitor thread when --stats or --monitor is given.
    // In --monitor mode we only consume stats and never load the scheduler.
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    // Run the scheduler, re-initializing it whenever a restart is requested
    // (by the kernel exit info or by a power-profile change).
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}