mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::mem::MaybeUninit;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::warn;
use log::{debug, info};
use scx_stats::prelude::*;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::NR_CPU_IDS;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::autopower::{PowerProfile, fetch_power_profile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_bpfland";

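/// Power mode used to select which CPUs belong to the primary scheduling domain.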
#[derive(PartialEq)]
enum Powermode {
    Performance,
    Powersave,
    Any,
}

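/// Return the list of CPU IDs matching the requested power mode: big cores for
/// Performance, little cores for Powersave, or every CPU for Any.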
fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            (Powermode::Performance, CoreType::Big { .. }) |
            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}

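/// Convert a list of CPU IDs into a hexadecimal cpumask string (e.g. "0xff"),
/// or "none" if the list is empty.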
fn cpus_to_cpumask(cpus: &Vec<usize>) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Find the highest CPU ID in the list.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // Allocate a byte-granular bitmask large enough to hold all CPUs.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the bit corresponding to each CPU ID.
    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Render the bitmask as a hex string, most significant byte first.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}

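// Command-line options parsed via clap.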
#[derive(Debug, Parser)]
struct Opts {
    // Exit debug dump buffer length (0 uses the default).
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    // Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    // Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    // Maximum time slice lag in microseconds (can be negative).
    #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
    slice_us_lag: i64,

    // CPU idle QoS resume latency in microseconds (negative to leave it unchanged).
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    // Disable preemption.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    no_preempt: bool,

    // Prioritize per-CPU tasks (sets the BPF `local_pcpu` flag).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    // Prioritize kernel threads (sets the BPF `local_kthreads` flag).
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    // Disable direct dispatch during synchronous wakeups.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    // Primary CPU domain: "auto", "performance", "powersave", "all" or a cpumask.
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    // Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    // Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    // Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    // Disable NUMA awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    // Drive the cpufreq performance level automatically (see init_cpufreq_perf).
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    #[clap(short = 'c', long, default_value = "10", hide = true)]
    nvcsw_max_thresh: u64,

    // Print statistics every N seconds.
    #[clap(long)]
    stats: Option<f64>,

    // Run in monitor mode: only print statistics every N seconds.
    #[clap(long)]
    monitor: Option<f64>,

    // Enable BPF debugging.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    // Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    // Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    // Show descriptions for the exported statistics and exit.
    #[clap(long)]
    help_stats: bool,
}

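/// Main scheduler state: the loaded BPF skeleton, the attached struct_ops link,
/// the system topology, the current power profile and the stats server.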
struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        set_rlimit_infinity();

        assert!(opts.slice_us >= opts.slice_us_min);

        let topo = Topology::new().unwrap();

        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Open the BPF skeleton.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops)?;

        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override the default BPF scheduling parameters.
        skel.maps.rodata_data.debug = opts.debug;
        skel.maps.rodata_data.smt_enabled = smt_enabled;
        skel.maps.rodata_data.numa_disabled = opts.disable_numa;
        skel.maps.rodata_data.local_pcpu = opts.local_pcpu;
        skel.maps.rodata_data.local_kthreads = opts.local_kthreads;
        skel.maps.rodata_data.no_preempt = opts.no_preempt;
        skel.maps.rodata_data.no_wake_sync = opts.no_wake_sync;
        skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
        skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
        skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;

        skel.maps.rodata_data.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Set the scheduler flags.
        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP;
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.bpfland_ops_mut().flags
        );

        // Load the BPF program.
        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;

        // Initialize the primary scheduling domain and the cpufreq performance level.
        let power_profile = fetch_power_profile(false);
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize the SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Initialize the L2 and L3 cache domains.
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler and start the stats server.
        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

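    /// Add a CPU to the primary scheduling domain by invoking the
    /// `enable_primary_cpu` BPF program via test_run (cpu = -1 resets the domain).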
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

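    /// Build the cpumask for the given power mode, falling back to all CPUs if
    /// no CPU matches the requested mode.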
    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

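    /// Resolve the primary CPU domain from the `--primary-domain` option (or from
    /// the detected power profile in "auto" mode) and push it to the BPF side.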
    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Performance | PowerProfile::Balanced => {
                    Self::epp_to_cpumask(Powermode::Performance)?
                }
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            &_ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

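    /// Set the cpufreq performance level: 0 (min) for the "powersave" domain,
    /// -1 (auto) when --cpufreq is enabled, 1024 (max) otherwise.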
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &String,
        auto: bool,
    ) -> Result<()> {
        let perf_lvl: i64 = match primary_domain.as_str() {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

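    /// Detect power profile changes at runtime. Returns true when the scheduler
    /// must be restarted to re-evaluate the primary domain ("auto" mode).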
    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = fetch_power_profile(false);
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

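    /// Register `sibling_cpu` as a sibling of `cpu` at the given topology level by
    /// invoking the `enable_sibling_cpu` BPF program via test_run.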
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

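    /// Propagate the SMT sibling of each CPU to the BPF side (topology level 0).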
    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

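    /// Return true if all the CPUs in the slice are SMT siblings of each other.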
    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        if cpus.len() <= 1 {
            return true;
        }

        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }

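    /// Group CPUs that share the same cache at the requested level and register
    /// them as siblings on the BPF side, skipping trivial (single-CPU or SMT-only)
    /// groups.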
    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Group CPUs by the ID of the cache they share at the requested level.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        for (cache_id, cpus) in cache_id_map {
            // Ignore single-CPU groups.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore groups that are just SMT siblings of each other.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    match enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu) {
                        Ok(()) => {}
                        Err(_) => {
                            warn!(
                                "L{} cache ID {}: failed to set CPU {} sibling {}",
                                cache_lvl, cache_id, *cpu, *sibling_cpu
                            );
                        }
                    }
                }
            }
        }

        Ok(())
    }

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

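    /// Snapshot the counters exported by the BPF side into a `Metrics` struct.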
    fn get_metrics(&self) -> Metrics {
        Metrics {
            nr_running: self.skel.maps.bss_data.nr_running,
            nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
            nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
            nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

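    /// Main loop: serve stats requests and watch for power profile changes until
    /// the BPF scheduler exits or a shutdown/restart is requested.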
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {} scheduler", SCHEDULER_NAME);

        // Restore the previous CPU idle QoS resume latency, if it was changed.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        // Re-initialize the scheduler when the BPF side requests a restart, or
        // silently when a power profile change (user_restart) requires it.
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}