mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::mem::MaybeUninit;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::warn;
use log::{debug, info};
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_flash";

#[derive(PartialEq)]
enum Powermode {
    Turbo,
    Performance,
    Powersave,
    Any,
}

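// Return the list of CPU IDs that belong to the requested power mode, based
// on the core types reported by the system topology (turbo-boosted big
// cores, big cores, little cores). The result can be empty (e.g. asking for
// little cores on a homogeneous system); callers such as epp_to_cpumask()
// below fall back to Powermode::Any in that case.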
fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            // Turbo-boosted big CPUs.
            (Powermode::Turbo, CoreType::Big { turbo: true })
            // All big CPUs.
            | (Powermode::Performance, CoreType::Big { .. })
            // Little (power-efficient) CPUs.
            | (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}

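// Convert a list of CPU IDs to a hex cpumask string: bit N set means CPU N
// is included. For example, CPUs [0, 1, 8] produce the bytes [0x03, 0x01],
// printed most-significant byte first as "0x0103".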
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Size the bitmask from the highest CPU ID, one bit per CPU, rounded up
    // to whole bytes.
    let max_cpu_id = *cpus.iter().max().unwrap();
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Render the mask from the most significant byte down.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}

#[derive(Debug, clap::Parser)]
#[command(
    name = "scx_flash",
    version,
    disable_version_flag = true,
    about = "A deadline-based scheduler focused on fairness and performance predictability.",
    long_about = r#"
scx_flash is a scheduler that focuses on ensuring fairness and performance predictability.

It operates using an earliest deadline first (EDF) policy, where the deadline of each task is
defined as:

   deadline = vruntime + exec_vruntime

`vruntime` represents the task's accumulated runtime, inversely scaled by its weight, while
`exec_vruntime` accounts for the vruntime accumulated since the last sleep event.

Fairness is ensured through `vruntime`, whereas `exec_vruntime` helps prioritize latency-sensitive
tasks. Tasks that are frequently blocked waiting for an event (typically latency-sensitive)
accumulate a smaller `exec_vruntime` than tasks that continuously consume CPU without
interruption.

As a result, tasks with a smaller `exec_vruntime` have a shorter deadline and are dispatched
earlier, ensuring better responsiveness for latency-sensitive tasks.

Moreover, tasks can accumulate a vruntime credit while they sleep, up to a maximum that depends on
how often they voluntarily release the CPU (`avg_nvcsw`). This allows prioritizing frequent
sleepers over less-frequent ones.
"#
)]
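// Worked example of the EDF policy described above (illustrative numbers):
// with two runnable tasks at equal weight and vruntime = 100ms, a task that
// sleeps often may have exec_vruntime = 1ms (deadline = 101ms), while a CPU
// hog may have exec_vruntime = 20ms (deadline = 120ms), so the frequent
// sleeper is dispatched first.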
struct Opts {
    /// Exit debug dump buffer length. 0 indicates the default value.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    /// Maximum time slice lag in microseconds (bounds the vruntime credit
    /// that a task can accumulate while sleeping).
    #[clap(short = 'l', long, default_value = "20000")]
    slice_us_lag: u64,

    /// Maximum run lag in microseconds (bounds the `exec_vruntime`
    /// accumulated since the last sleep event).
    #[clap(short = 'r', long, default_value = "20000")]
    run_us_lag: u64,

    /// Maximum average number of voluntary context switches used when
    /// classifying tasks as frequent sleepers.
    #[clap(short = 'c', long, default_value = "128")]
    max_avg_nvcsw: u64,

    /// CPU throttling interval in microseconds (0 = throttling disabled).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Idle QoS resume latency in microseconds, applied to all CPUs
    /// (negative = leave the idle QoS settings untouched).
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    /// Dispatch per-CPU tasks directly (sets the scheduler's `local_pcpu`
    /// flag).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Use the tasks' native priorities (sets the scheduler's
    /// `native_priority` flag).
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    native_priority: bool,

    /// Dispatch kernel threads directly to their local queue (implied when
    /// CPU throttling is enabled).
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Primary CPU domain: "auto", "performance", "powersave", "turbo",
    /// "all", or an explicit cpumask (e.g. "0xff").
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// Enable stats monitoring with the specified interval, in seconds.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval, in
    /// seconds; the scheduler itself is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show the descriptions of the exported statistics and exit.
    #[clap(long)]
    help_stats: bool,
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        set_rlimit_infinity();

        assert!(opts.slice_us >= opts.slice_us_min);

        let topo = Topology::new().unwrap();

        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Initialize the BPF skeleton.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let mut skel = scx_ops_open!(skel_builder, open_object, flash_ops)?;

        skel.struct_ops.flash_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override the default scheduler parameters with the command-line
        // options.
        skel.maps.rodata_data.debug = opts.debug;
        skel.maps.rodata_data.smt_enabled = smt_enabled;
        skel.maps.rodata_data.numa_disabled = opts.disable_numa;
        skel.maps.rodata_data.local_pcpu = opts.local_pcpu;
        skel.maps.rodata_data.no_wake_sync = opts.no_wake_sync;
        skel.maps.rodata_data.native_priority = opts.native_priority;
        skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
        skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
        skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
        skel.maps.rodata_data.run_lag = opts.run_us_lag * 1000;
        skel.maps.rodata_data.throttle_ns = opts.throttle_us * 1000;
        skel.maps.rodata_data.max_avg_nvcsw = opts.max_avg_nvcsw;

        // Always dispatch kthreads directly when CPU throttling is enabled
        // (avoids stalling critical kernel threads while CPUs are throttled).
        skel.maps.rodata_data.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        skel.maps.rodata_data.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        skel.struct_ops.flash_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP;
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.flash_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, flash_ops, uei)?;

        // Initialize the primary scheduling domain and the cpufreq settings.
        let power_profile = Self::power_profile();
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize the SMT and cache topology domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler and start the stats server.
        let struct_ops = Some(scx_ops_attach!(skel, flash_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

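    // Call a BPF "syscall" program via BPF_PROG_TEST_RUN (prog.test_run),
    // passing the argument struct as raw bytes through context_in. This is
    // how user space updates the primary domain here and the topology
    // domains in enable_sibling_cpu() below: a cpu_id of -1 resets the
    // primary domain, and the BPF program's return value is propagated as
    // the error code.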
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

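    // Map an energy performance profile to the corresponding cpumask,
    // falling back to all CPUs when no CPU matches the requested mode.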
    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

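    // Resolve the primary CPU domain from the --primary-domain option: a
    // symbolic name ("powersave", "performance", "turbo", "all"), "auto"
    // (derived from the current power profile), or an explicit cpumask
    // string. The resulting mask is pushed to the BPF side one CPU at a
    // time, after resetting the previous domain with cpu = -1.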
    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false } => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Performance => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            _ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        // Reset the previous primary domain, then enable the selected CPUs.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

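    // Configure the cpufreq performance level hint for the BPF side:
    // 1024 = max performance, 0 = min, and a negative value lets the
    // scheduler pick the level automatically.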
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        auto: bool,
    ) -> Result<()> {
        let perf_lvl: i64 = match primary_domain {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

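    // Detect power profile changes at runtime. Returns true when the
    // primary domain must be rebuilt (i.e. --primary-domain is "auto"),
    // which makes run() exit with user_restart set so that main() restarts
    // the scheduler with the new domain; otherwise only the cpufreq
    // performance level is refreshed in place.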
    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

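    // Register each CPU's SMT sibling with the BPF side, using level 0 for
    // the SMT domain (levels 2 and 3 are used for the cache domains below).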
    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            // Skip CPUs that report no SMT sibling (-1): casting -1 to usize
            // would produce an out-of-range CPU ID.
            if *sibling_cpu < 0 {
                continue;
            }
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        // A single CPU is trivially its own SMT group.
        if cpus.len() <= 1 {
            return true;
        }

        // Check whether all the CPUs are SMT siblings of the first one.
        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }

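    // Group CPUs by cache ID at the given level (2 = L2, 3 = LLC) and
    // register every (cpu, sibling) pair of each group with the BPF side.
    // Groups with a single CPU, or whose CPUs are all SMT siblings of the
    // same core, are skipped (the SMT domain initialized above already
    // covers them).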
    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Determine the list of CPU IDs associated to each cache node.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        // Update the BPF cpumasks for the cache domains.
        for (cache_id, cpus) in cache_id_map {
            // Ignore the cache domain if it includes a single CPU.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore the cache domain if all the CPUs belong to the same SMT core.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

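    // Collect the current values of the counters exported by the BPF side
    // (the stats server polls this over the request channel in run()).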
    fn get_metrics(&self) -> Metrics {
        Metrics {
            nr_running: self.skel.maps.bss_data.nr_running,
            nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
            nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
            nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        // Detach the scheduler before reporting the exit status.
        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {} scheduler", SCHEDULER_NAME);

        // Restore the default CPU idle QoS resume latency.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    // Start the stats monitor thread; in --monitor mode, only report the
    // stats of a running instance without launching the scheduler.
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => debug!("stats monitor thread finished successfully"),
                Err(err) => warn!("stats monitor thread finished with error: {}", err),
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    // Run the scheduler, restarting it when a primary domain refresh is
    // required (user_restart) or when the BPF side requests a restart.
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}