1mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::c_int;
15use std::fmt::Write;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use anyhow::anyhow;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::{debug, info, warn};
31use scx_stats::prelude::*;
32use scx_utils::autopower::{fetch_power_profile, PowerProfile};
33use scx_utils::build_id;
34use scx_utils::compat;
35use scx_utils::get_primary_cpus;
36use scx_utils::libbpf_clap_opts::LibbpfOpts;
37use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
38use scx_utils::scx_ops_attach;
39use scx_utils::scx_ops_load;
40use scx_utils::scx_ops_open;
41use scx_utils::try_set_rlimit_infinity;
42use scx_utils::uei_exited;
43use scx_utils::uei_report;
44use scx_utils::Cpumask;
45use scx_utils::Powermode;
46use scx_utils::Topology;
47use scx_utils::UserExitInfo;
48use scx_utils::NR_CPU_IDS;
49use stats::Metrics;
50
/// Scheduler name printed in the startup banner, in the `--version` output and
/// in the unregister log message.
const SCHEDULER_NAME: &str = "scx_flash";
52
/// Render a list of CPU ids as a hexadecimal bitmask string.
///
/// Bit `n` of the mask is set when `n` appears in `cpus` (duplicates are
/// harmless). The result is `0x` followed by two hex digits per byte, most
/// significant byte first, with just enough bytes to hold the highest CPU id.
///
/// Returns the literal string `"none"` when `cpus` is empty.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    // `max()` is `None` only for an empty list.
    let max_cpu_id = match cpus.iter().max() {
        Some(&id) => id,
        None => return String::from("none"),
    };

    // One byte per 8 CPUs, rounded up so that `max_cpu_id` itself fits.
    let mut bitmask = vec![0u8; max_cpu_id / 8 + 1];
    for &cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // Emit the bytes from most to least significant.
    let mut hex_str = String::with_capacity(2 + bitmask.len() * 2);
    hex_str.push_str("0x");
    for byte in bitmask.iter().rev() {
        // Writing into a `String` cannot fail.
        let _ = write!(hex_str, "{:02x}", byte);
    }

    hex_str
}
80
81#[derive(Debug, clap::Parser)]
82#[command(
83 name = "scx_flash",
84 version,
85 disable_version_flag = true,
86 about = "A deadline-based scheduler focused on fairness and performance predictability.",
87 long_about = r#"
88scx_flash is scheduler that focuses on ensuring fairness and performance predictability.
89
90It operates using an earliest deadline first (EDF) policy. The deadline of each task deadline is
91defined as:
92
93 deadline = vruntime + exec_vruntime
94
95Here, `vruntime` represents the task's total accumulated runtime, inversely scaled by its weight,
96while `exec_vruntime` accounts for the scaled runtime accumulated since the last sleep event.
97
98Fairness is driven by `vruntime`, while `exec_vruntime` helps prioritize latency-sensitive tasks
99that sleep frequently and use the CPU in short bursts.
100
101To prevent sleeping tasks from gaining excessive priority, the maximum vruntime credit a task can
102accumulate while sleeping is capped by `slice_lag`: tasks that sleep frequently can receive a larger
103credit, while tasks that perform fewer, longer sleeps are granted a smaller credit. This encourages
104responsive behavior without excessively boosting idle tasks.
105
106When dynamic fairness is enabled (`--slice-lag-scaling`), the maximum vruntime sleep credit is also
107scaled depending on the user-mode CPU utilization:
108
109 - At low utilization (mostly idle system), the impact of `vruntime` is reduced, and scheduling
110 decisions are driven primarily by `exec_vruntime`. This favors bursty, latency-sensitive
111 workloads (i.e., hackbench), improving their performance and latency.
112
113 - At high utilization, sleeping tasks regain their vruntime credit, increasing the influence of
114 `vruntime` in deadline calculation. This restores fairness and ensures system responsiveness
115 under load.
116
117This adaptive behavior allows the scheduler to prioritize intense message-passing workloads when
118the system is lightly loaded, while maintaining fairness and responsiveness when the system is
119saturated or overcommitted.
120"#
121)]
122struct Opts {
123 #[clap(long, default_value = "0")]
125 exit_dump_len: u32,
126
127 #[clap(short = 's', long, default_value = "700")]
129 slice_us: u64,
130
131 #[clap(short = 'l', long, default_value = "20000")]
136 slice_us_lag: u64,
137
138 #[clap(short = 't', long, default_value = "0")]
143 throttle_us: u64,
144
145 #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "32")]
151 idle_resume_us: i64,
152
153 #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
159 tickless: bool,
160
161 #[clap(short = 'R', long, action = clap::ArgAction::SetTrue)]
166 rr_sched: bool,
167
168 #[clap(long, action = clap::ArgAction::SetTrue)]
175 wakeup_throttle: bool,
176
177 #[clap(short = 'm', long, default_value = "auto")]
189 primary_domain: String,
190
191 #[clap(long, action = clap::ArgAction::SetTrue)]
193 disable_smt: bool,
194
195 #[clap(long, action = clap::ArgAction::SetTrue)]
197 disable_numa: bool,
198
199 #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
203 cpufreq: bool,
204
205 #[clap(long)]
207 stats: Option<f64>,
208
209 #[clap(long)]
212 monitor: Option<f64>,
213
214 #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
216 debug: bool,
217
218 #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
220 verbose: bool,
221
222 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
224 version: bool,
225
226 #[clap(long)]
228 help_stats: bool,
229
230 #[clap(flatten, next_help_heading = "Libbpf Options")]
231 pub libbpf: LibbpfOpts,
232}
233
/// State of one running scheduler instance, created by `Scheduler::init()` and
/// driven by `Scheduler::run()`.
struct Scheduler<'a> {
    /// Loaded BPF skeleton; its programs and data maps are used by the methods below.
    skel: BpfSkel<'a>,
    /// Link returned by `scx_ops_attach!`; `run()` takes (and thereby drops) it before
    /// reporting the exit info.
    struct_ops: Option<libbpf_rs::Link>,
    /// Parsed command-line options.
    opts: &'a Opts,
    /// Topology captured in `init()`; `Drop` reads each CPU's `pm_qos_resume_latency_us`
    /// from it when restoring the idle resume latency.
    topo: Topology,
    /// Power profile seen at the last check; `refresh_sched_domain()` compares it with a
    /// fresh reading.
    power_profile: PowerProfile,
    /// Stats server launched in `init()`; `run()` answers its requests with `Metrics`.
    stats_server: StatsServer<(), Metrics>,
    /// Set by `run()` when `refresh_sched_domain()` asks for a restart; `main()` checks it
    /// to decide whether to re-create the scheduler.
    user_restart: bool,
}
243
244impl<'a> Scheduler<'a> {
245 fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
246 try_set_rlimit_infinity();
247
248 let topo = Topology::new().unwrap();
250
251 let smt_enabled = !opts.disable_smt && topo.smt_enabled;
253
254 info!(
255 "{} {} {}",
256 SCHEDULER_NAME,
257 build_id::full_version(env!("CARGO_PKG_VERSION")),
258 if smt_enabled { "SMT on" } else { "SMT off" }
259 );
260
261 info!(
263 "scheduler options: {}",
264 std::env::args().collect::<Vec<_>>().join(" ")
265 );
266
267 if opts.idle_resume_us >= 0 {
268 if !cpu_idle_resume_latency_supported() {
269 warn!("idle resume latency not supported");
270 } else {
271 info!("Setting idle QoS to {} us", opts.idle_resume_us);
272 for cpu in topo.all_cpus.values() {
273 update_cpu_idle_resume_latency(
274 cpu.id,
275 opts.idle_resume_us.try_into().unwrap(),
276 )?;
277 }
278 }
279 }
280
281 let nr_nodes = topo
283 .nodes
284 .values()
285 .filter(|node| !node.all_cpus.is_empty())
286 .count();
287 info!("NUMA nodes: {}", nr_nodes);
288
289 let numa_disabled = opts.disable_numa || nr_nodes == 1;
291 if numa_disabled {
292 info!("Disabling NUMA optimizations");
293 }
294
295 let power_profile = Self::power_profile();
297 let domain =
298 Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
299 anyhow!(
300 "failed to resolve primary domain '{}': {}",
301 &opts.primary_domain,
302 err
303 )
304 })?;
305
306 let mut skel_builder = BpfSkelBuilder::default();
308 skel_builder.obj_builder.debug(opts.verbose);
309 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
310 let mut skel = scx_ops_open!(skel_builder, open_object, flash_ops, open_opts)?;
311
312 skel.struct_ops.flash_ops_mut().exit_dump_len = opts.exit_dump_len;
313
314 let rodata = skel.maps.rodata_data.as_mut().unwrap();
316 rodata.debug = opts.debug;
317 rodata.smt_enabled = smt_enabled;
318 rodata.numa_disabled = numa_disabled;
319 rodata.rr_sched = opts.rr_sched;
320 rodata.wakeup_throttle = opts.wakeup_throttle;
321 rodata.tickless_sched = opts.tickless;
322 rodata.slice_max = opts.slice_us * 1000;
323 rodata.slice_lag = opts.slice_us_lag * 1000;
324 rodata.throttle_ns = opts.throttle_us * 1000;
325 rodata.primary_all = domain.weight() == *NR_CPU_IDS;
326
327 skel.struct_ops.flash_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
329 | *compat::SCX_OPS_ENQ_LAST
330 | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
331 | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
332 | if numa_disabled {
333 0
334 } else {
335 *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
336 };
337 info!(
338 "scheduler flags: {:#x}",
339 skel.struct_ops.flash_ops_mut().flags
340 );
341
342 let mut skel = scx_ops_load!(skel, flash_ops, uei)?;
344
345 Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
347 anyhow!(
348 "failed to initialize primary domain 0x{:x}: {}",
349 domain,
350 err
351 )
352 })?;
353
354 if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
355 bail!(
356 "failed to initialize cpufreq performance level: error {}",
357 err
358 );
359 }
360
361 if smt_enabled {
363 Self::init_smt_domains(&mut skel, &topo)?;
364 }
365
366 let struct_ops = Some(scx_ops_attach!(skel, flash_ops)?);
368 let stats_server = StatsServer::new(stats::server_data()).launch()?;
369
370 Ok(Self {
371 skel,
372 struct_ops,
373 opts,
374 topo,
375 power_profile,
376 stats_server,
377 user_restart: false,
378 })
379 }
380
381 fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
382 let prog = &mut skel.progs.enable_primary_cpu;
383 let mut args = cpu_arg {
384 cpu_id: cpu as c_int,
385 };
386 let input = ProgramInput {
387 context_in: Some(unsafe {
388 std::slice::from_raw_parts_mut(
389 &mut args as *mut _ as *mut u8,
390 std::mem::size_of_val(&args),
391 )
392 }),
393 ..Default::default()
394 };
395 let out = prog.test_run(input).unwrap();
396 if out.return_value != 0 {
397 return Err(out.return_value);
398 }
399
400 Ok(())
401 }
402
403 fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
404 let mut cpus = get_primary_cpus(profile).unwrap_or_default();
405 if cpus.is_empty() {
406 cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
407 }
408 Cpumask::from_str(&cpus_to_cpumask(&cpus))
409 }
410
411 fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
412 let domain = match primary_domain {
413 "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
414 "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
415 "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
416 "auto" => match power_profile {
417 PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
418 PowerProfile::Balanced { power: true } => {
419 Self::epp_to_cpumask(Powermode::Powersave)?
420 }
421 PowerProfile::Balanced { power: false }
422 | PowerProfile::Performance
423 | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
424 },
425 "all" => Self::epp_to_cpumask(Powermode::Any)?,
426 &_ => Cpumask::from_str(primary_domain)?,
427 };
428
429 Ok(domain)
430 }
431
432 fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
433 info!("primary CPU domain = 0x{:x}", domain);
434
435 if let Err(err) = Self::enable_primary_cpu(skel, -1) {
437 bail!("failed to reset primary domain: error {}", err);
438 }
439
440 for cpu in 0..*NR_CPU_IDS {
442 if domain.test_cpu(cpu) {
443 if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
444 bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
445 }
446 }
447 }
448
449 Ok(())
450 }
451
452 fn init_cpufreq_perf(
454 skel: &mut BpfSkel<'_>,
455 primary_domain: &String,
456 auto: bool,
457 ) -> Result<()> {
458 let perf_lvl: i64 = match primary_domain.as_str() {
461 "powersave" => 0,
462 _ if auto => -1,
463 _ => 1024,
464 };
465 info!(
466 "cpufreq performance level: {}",
467 match perf_lvl {
468 1024 => "max".into(),
469 0 => "min".into(),
470 n if n < 0 => "auto".into(),
471 _ => perf_lvl.to_string(),
472 }
473 );
474 skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;
475
476 Ok(())
477 }
478
479 fn power_profile() -> PowerProfile {
480 let profile = fetch_power_profile(true);
481 if profile == PowerProfile::Unknown {
482 fetch_power_profile(false)
483 } else {
484 profile
485 }
486 }
487
488 fn refresh_sched_domain(&mut self) -> bool {
489 if self.power_profile != PowerProfile::Unknown {
490 let power_profile = Self::power_profile();
491 if power_profile != self.power_profile {
492 self.power_profile = power_profile;
493
494 if self.opts.primary_domain == "auto" {
495 return true;
496 }
497 if let Err(err) = Self::init_cpufreq_perf(
498 &mut self.skel,
499 &self.opts.primary_domain,
500 self.opts.cpufreq,
501 ) {
502 warn!("failed to refresh cpufreq performance level: error {}", err);
503 }
504 }
505 }
506
507 false
508 }
509
510 fn enable_sibling_cpu(
511 skel: &mut BpfSkel<'_>,
512 lvl: usize,
513 cpu: usize,
514 sibling_cpu: usize,
515 ) -> Result<(), u32> {
516 let prog = &mut skel.progs.enable_sibling_cpu;
517 let mut args = domain_arg {
518 lvl_id: lvl as c_int,
519 cpu_id: cpu as c_int,
520 sibling_cpu_id: sibling_cpu as c_int,
521 };
522 let input = ProgramInput {
523 context_in: Some(unsafe {
524 std::slice::from_raw_parts_mut(
525 &mut args as *mut _ as *mut u8,
526 std::mem::size_of_val(&args),
527 )
528 }),
529 ..Default::default()
530 };
531 let out = prog.test_run(input).unwrap();
532 if out.return_value != 0 {
533 return Err(out.return_value);
534 }
535
536 Ok(())
537 }
538
539 fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
540 let smt_siblings = topo.sibling_cpus();
541
542 info!("SMT sibling CPUs: {:?}", smt_siblings);
543 for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
544 Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
545 }
546
547 Ok(())
548 }
549
550 fn get_metrics(&self) -> Metrics {
551 let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
552 Metrics {
553 nr_running: bss_data.nr_running,
554 nr_cpus: bss_data.nr_online_cpus,
555 nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
556 nr_direct_dispatches: bss_data.nr_direct_dispatches,
557 nr_shared_dispatches: bss_data.nr_shared_dispatches,
558 }
559 }
560
/// Report whether the scheduler has exited, as determined by the `uei_exited!`
/// macro applied to the skeleton's `uei` exit info.
///
/// `run()` checks this on every iteration of its main loop.
pub fn exited(&mut self) -> bool {
    uei_exited!(&self.skel, uei)
}
564
565 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
566 let (res_ch, req_ch) = self.stats_server.channels();
567
568 while !shutdown.load(Ordering::Relaxed) && !self.exited() {
569 if self.refresh_sched_domain() {
570 self.user_restart = true;
571 break;
572 }
573
574 match req_ch.recv_timeout(Duration::from_secs(1)) {
575 Ok(()) => res_ch.send(self.get_metrics())?,
576 Err(RecvTimeoutError::Timeout) => {}
577 Err(e) => Err(e)?,
578 }
579 }
580
581 let _ = self.struct_ops.take();
582 uei_report!(&self.skel, uei)
583 }
584}
585
586impl Drop for Scheduler<'_> {
587 fn drop(&mut self) {
588 info!("Unregister {SCHEDULER_NAME} scheduler");
589
590 if self.opts.idle_resume_us >= 0 {
592 if cpu_idle_resume_latency_supported() {
593 for cpu in self.topo.all_cpus.values() {
594 update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
595 .unwrap();
596 }
597 }
598 }
599 }
600}
601
602fn main() -> Result<()> {
603 let opts = Opts::parse();
604
605 if opts.version {
606 println!(
607 "{} {}",
608 SCHEDULER_NAME,
609 build_id::full_version(env!("CARGO_PKG_VERSION"))
610 );
611 return Ok(());
612 }
613
614 if opts.help_stats {
615 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
616 return Ok(());
617 }
618
619 let loglevel = simplelog::LevelFilter::Info;
620
621 let mut lcfg = simplelog::ConfigBuilder::new();
622 lcfg.set_time_offset_to_local()
623 .expect("Failed to set local time offset")
624 .set_time_level(simplelog::LevelFilter::Error)
625 .set_location_level(simplelog::LevelFilter::Off)
626 .set_target_level(simplelog::LevelFilter::Off)
627 .set_thread_level(simplelog::LevelFilter::Off);
628 simplelog::TermLogger::init(
629 loglevel,
630 lcfg.build(),
631 simplelog::TerminalMode::Stderr,
632 simplelog::ColorChoice::Auto,
633 )?;
634
635 let shutdown = Arc::new(AtomicBool::new(false));
636 let shutdown_clone = shutdown.clone();
637 ctrlc::set_handler(move || {
638 shutdown_clone.store(true, Ordering::Relaxed);
639 })
640 .context("Error setting Ctrl-C handler")?;
641
642 if let Some(intv) = opts.monitor.or(opts.stats) {
643 let shutdown_copy = shutdown.clone();
644 let jh = std::thread::spawn(move || {
645 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
646 Ok(_) => {
647 debug!("stats monitor thread finished successfully")
648 }
649 Err(error_object) => {
650 warn!(
651 "stats monitor thread finished because of an error {}",
652 error_object
653 )
654 }
655 }
656 });
657 if opts.monitor.is_some() {
658 let _ = jh.join();
659 return Ok(());
660 }
661 }
662
663 let mut open_object = MaybeUninit::uninit();
664 loop {
665 let mut sched = Scheduler::init(&opts, &mut open_object)?;
666 if !sched.run(shutdown.clone())?.should_restart() {
667 if sched.user_restart {
668 continue;
669 }
670 break;
671 }
672 }
673
674 Ok(())
675}