1mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::c_int;
15use std::fmt::Write;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use anyhow::anyhow;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::{debug, info, warn};
31use scx_stats::prelude::*;
32use scx_utils::autopower::{fetch_power_profile, PowerProfile};
33use scx_utils::build_id;
34use scx_utils::compat;
35use scx_utils::libbpf_clap_opts::LibbpfOpts;
36use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
37use scx_utils::scx_ops_attach;
38use scx_utils::scx_ops_load;
39use scx_utils::scx_ops_open;
40use scx_utils::try_set_rlimit_infinity;
41use scx_utils::uei_exited;
42use scx_utils::uei_report;
43use scx_utils::CoreType;
44use scx_utils::Cpumask;
45use scx_utils::Topology;
46use scx_utils::UserExitInfo;
47use scx_utils::NR_CPU_IDS;
48use stats::Metrics;
49
/// Scheduler name used for logging and version reporting.
const SCHEDULER_NAME: &str = "scx_flash";
51
/// CPU selection mode used to build the primary scheduling domain
/// (see `get_primary_cpus` for how each mode maps to core types).
#[derive(PartialEq)]
enum Powermode {
    // Only big cores capable of turbo frequencies.
    Turbo,
    // All big (performance) cores, turbo-capable or not.
    Performance,
    // Only little (efficiency) cores.
    Powersave,
    // Every CPU, regardless of core type.
    Any,
}
59
60fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
61 let topo = Topology::new().unwrap();
62
63 let cpus: Vec<usize> = topo
64 .all_cores
65 .values()
66 .flat_map(|core| &core.cpus)
67 .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
68 (Powermode::Turbo, CoreType::Big { turbo: true }) |
70 (Powermode::Performance, CoreType::Big { .. }) |
72 (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
74 (Powermode::Any, ..) => Some(*cpu_id),
75 _ => None,
76 })
77 .collect();
78
79 Ok(cpus)
80}
81
/// Convert a list of CPU ids into a hex cpumask string (e.g. "0x0f").
///
/// Bit N of the mask corresponds to CPU N; the mask is rendered with the
/// most significant byte first. Returns "none" when `cpus` is empty.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Size the bitmask to cover the highest CPU id, one bit per CPU.
    // `unwrap()` is safe: the list is non-empty here.
    let max_cpu_id = *cpus.iter().max().unwrap();
    let mut bitmask = vec![0u8; (max_cpu_id + 1).div_ceil(8)];

    for cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // Render bytes most-significant first, two hex digits per byte.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}
109
110#[derive(Debug, clap::Parser)]
111#[command(
112 name = "scx_flash",
113 version,
114 disable_version_flag = true,
115 about = "A deadline-based scheduler focused on fairness and performance predictability.",
116 long_about = r#"
117scx_flash is scheduler that focuses on ensuring fairness and performance predictability.
118
119It operates using an earliest deadline first (EDF) policy. The deadline of each task deadline is
120defined as:
121
122 deadline = vruntime + exec_vruntime
123
124Here, `vruntime` represents the task's total accumulated runtime, inversely scaled by its weight,
125while `exec_vruntime` accounts for the scaled runtime accumulated since the last sleep event.
126
127Fairness is driven by `vruntime`, while `exec_vruntime` helps prioritize latency-sensitive tasks
128that sleep frequently and use the CPU in short bursts.
129
130To prevent sleeping tasks from gaining excessive priority, the maximum vruntime credit a task can
131accumulate while sleeping is capped by `slice_lag`: tasks that sleep frequently can receive a larger
132credit, while tasks that perform fewer, longer sleeps are granted a smaller credit. This encourages
133responsive behavior without excessively boosting idle tasks.
134
135When dynamic fairness is enabled (`--slice-lag-scaling`), the maximum vruntime sleep credit is also
136scaled depending on the user-mode CPU utilization:
137
138 - At low utilization (mostly idle system), the impact of `vruntime` is reduced, and scheduling
139 decisions are driven primarily by `exec_vruntime`. This favors bursty, latency-sensitive
140 workloads (i.e., hackbench), improving their performance and latency.
141
142 - At high utilization, sleeping tasks regain their vruntime credit, increasing the influence of
143 `vruntime` in deadline calculation. This restores fairness and ensures system responsiveness
144 under load.
145
146This adaptive behavior allows the scheduler to prioritize intense message-passing workloads when
147the system is lightly loaded, while maintaining fairness and responsiveness when the system is
148saturated or overcommitted.
149"#
150)]
151struct Opts {
152 #[clap(long, default_value = "0")]
154 exit_dump_len: u32,
155
156 #[clap(short = 's', long, default_value = "700")]
158 slice_us: u64,
159
160 #[clap(short = 'l', long, default_value = "20000")]
165 slice_us_lag: u64,
166
167 #[clap(short = 't', long, default_value = "0")]
172 throttle_us: u64,
173
174 #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "32")]
180 idle_resume_us: i64,
181
182 #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
188 tickless: bool,
189
190 #[clap(short = 'R', long, action = clap::ArgAction::SetTrue)]
195 rr_sched: bool,
196
197 #[clap(short = 'm', long, default_value = "auto")]
209 primary_domain: String,
210
211 #[clap(long, action = clap::ArgAction::SetTrue)]
213 disable_smt: bool,
214
215 #[clap(long, action = clap::ArgAction::SetTrue)]
217 disable_numa: bool,
218
219 #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
223 cpufreq: bool,
224
225 #[clap(long)]
227 stats: Option<f64>,
228
229 #[clap(long)]
232 monitor: Option<f64>,
233
234 #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
236 debug: bool,
237
238 #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
240 verbose: bool,
241
242 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
244 version: bool,
245
246 #[clap(long)]
248 help_stats: bool,
249
250 #[clap(flatten, next_help_heading = "Libbpf Options")]
251 pub libbpf: LibbpfOpts,
252}
253
/// Runtime state of the user-space side of the scheduler.
struct Scheduler<'a> {
    // Loaded BPF skeleton.
    skel: BpfSkel<'a>,
    // Attached struct_ops link; taking/dropping it detaches the scheduler.
    struct_ops: Option<libbpf_rs::Link>,
    // Parsed command-line options.
    opts: &'a Opts,
    // Host topology snapshot taken at init time.
    topo: Topology,
    // Power profile detected at init; compared at runtime to detect changes.
    power_profile: PowerProfile,
    // Server answering stats requests with `Metrics` snapshots.
    stats_server: StatsServer<(), Metrics>,
    // Set when the main loop wants a full user-space restart (e.g. the
    // power profile changed with an "auto" primary domain).
    user_restart: bool,
}
263
impl<'a> Scheduler<'a> {
    /// Open, configure, load and attach the BPF scheduler, then start the
    /// stats server. Returns the fully initialized scheduler instance.
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        try_set_rlimit_infinity();

        // Snapshot the host topology (CPUs, cores, NUMA nodes).
        let topo = Topology::new().unwrap();

        // SMT support is used only when the hardware reports it and the user
        // did not explicitly disable it.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Log the full command line to ease post-mortem debugging.
        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        // A non-negative option requests a per-CPU idle resume latency QoS;
        // the original values are restored in Drop.
        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Count only NUMA nodes that actually contain CPUs.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // NUMA optimizations are pointless on a single-node system.
        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        // Resolve the primary CPU domain from the -m option and the current
        // power profile.
        let power_profile = Self::power_profile();
        let domain =
            Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
                anyhow!(
                    "failed to resolve primary domain '{}': {}",
                    &opts.primary_domain,
                    err
                )
            })?;

        // Open the BPF skeleton, forwarding verbosity and libbpf options.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, flash_ops, open_opts)?;

        skel.struct_ops.flash_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Propagate the user configuration to the BPF side via read-only
        // data; this must happen before the skeleton is loaded.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.debug = opts.debug;
        rodata.smt_enabled = smt_enabled;
        rodata.numa_disabled = numa_disabled;
        rodata.rr_sched = opts.rr_sched;
        rodata.tickless_sched = opts.tickless;
        rodata.slice_max = opts.slice_us * 1000; // us -> ns
        rodata.slice_lag = opts.slice_us_lag * 1000; // us -> ns
        rodata.throttle_ns = opts.throttle_us * 1000; // us -> ns
        // True when the primary domain spans every CPU id in the system.
        rodata.primary_all = domain.weight() == *NR_CPU_IDS;

        // Select the sched_ext ops flags; per-node idle tracking is only
        // enabled when NUMA optimizations are active.
        skel.struct_ops.flash_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.flash_ops_mut().flags
        );

        // Load the BPF programs into the kernel.
        let mut skel = scx_ops_load!(skel, flash_ops, uei)?;

        // Push the primary domain CPUs to the BPF side.
        Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
            anyhow!(
                "failed to initialize primary domain 0x{:x}: {}",
                domain,
                err
            )
        })?;

        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            bail!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Tell the BPF side which CPUs are SMT siblings.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Attach the struct_ops: from this point on the scheduler is active.
        let struct_ops = Some(scx_ops_attach!(skel, flash_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

    /// Run the `enable_primary_cpu` BPF syscall program to add `cpu` to the
    /// primary domain; a negative cpu id resets the domain (see
    /// `init_energy_domain`). On failure, returns the BPF program's non-zero
    /// return value.
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a live, initialized struct and the slice
            // covers exactly its bytes for the duration of the call.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    /// Build a cpumask from the CPUs matching the given power mode, falling
    /// back to all CPUs when no CPU matches the requested mode.
    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

    /// Translate the `--primary-domain` option into a concrete cpumask.
    /// Known keywords select CPUs by core type ("auto" also considers the
    /// detected power profile); any other string is parsed as a literal mask.
    fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false }
                | PowerProfile::Performance
                | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            // Anything else is treated as an explicit cpumask string.
            &_ => Cpumask::from_str(primary_domain)?,
        };

        Ok(domain)
    }

    /// Program the primary domain into the BPF side: first reset it
    /// (cpu == -1), then add every CPU present in `domain`.
    fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
        info!("primary CPU domain = 0x{:x}", domain);

        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            bail!("failed to reset primary domain: error {}", err);
        }

        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

    /// Set the cpufreq performance level consumed by the BPF side:
    /// 0 (min) for a "powersave" primary domain, -1 (auto) when
    /// -f/--cpufreq is set, 1024 (max) otherwise.
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &String,
        auto: bool,
    ) -> Result<()> {
        // Note the arm order: "powersave" wins over the `auto` flag.
        let perf_lvl: i64 = match primary_domain.as_str() {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

    /// Detect the current power profile. If the first fetch reports
    /// `Unknown`, retry with the flag inverted (the flag's semantics are
    /// defined by `scx_utils::autopower::fetch_power_profile` — confirm
    /// there).
    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

    /// Re-check the power profile. Returns true when the scheduler must be
    /// restarted to re-resolve an "auto" primary domain; otherwise refreshes
    /// the cpufreq performance level in place and returns false.
    fn refresh_sched_domain(&mut self) -> bool {
        // If the profile was Unknown at init, there is nothing to track.
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                // With "auto" the primary domain depends on the power
                // profile, so a full restart is required (handled in run()).
                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

    /// Run the `enable_sibling_cpu` BPF syscall program to register
    /// `sibling_cpu` as the sibling of `cpu` at level `lvl`. On failure,
    /// returns the BPF program's non-zero return value.
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a live, initialized struct and the slice
            // covers exactly its bytes for the duration of the call.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    /// Register every CPU's SMT sibling with the BPF side (level 0).
    /// `sibling_cpus()` is indexed by CPU id, hence the `enumerate()`.
    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

    /// Snapshot the counters exported by the BPF side into a `Metrics`.
    fn get_metrics(&self) -> Metrics {
        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
        Metrics {
            nr_running: bss_data.nr_running,
            nr_cpus: bss_data.nr_online_cpus,
            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: bss_data.nr_direct_dispatches,
            nr_shared_dispatches: bss_data.nr_shared_dispatches,
        }
    }

    /// True when the BPF scheduler exited on its own (user exit info set).
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    /// Main loop: serve stats requests and watch for power-profile changes
    /// until shutdown is requested or the BPF side exits. Detaches the
    /// scheduler on the way out and returns the BPF user exit info.
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            // A power-profile change with an "auto" domain requires a
            // restart; main() re-creates the scheduler when user_restart is
            // set.
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }

            // Serve at most one stats request per iteration; the 1s timeout
            // bounds how long we go without re-checking the exit conditions.
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        // Dropping the struct_ops link detaches the BPF scheduler.
        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}
604
605impl Drop for Scheduler<'_> {
606 fn drop(&mut self) {
607 info!("Unregister {SCHEDULER_NAME} scheduler");
608
609 if self.opts.idle_resume_us >= 0 {
611 if cpu_idle_resume_latency_supported() {
612 for cpu in self.topo.all_cpus.values() {
613 update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
614 .unwrap();
615 }
616 }
617 }
618 }
619}
620
621fn main() -> Result<()> {
622 let opts = Opts::parse();
623
624 if opts.version {
625 println!(
626 "{} {}",
627 SCHEDULER_NAME,
628 build_id::full_version(env!("CARGO_PKG_VERSION"))
629 );
630 return Ok(());
631 }
632
633 if opts.help_stats {
634 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
635 return Ok(());
636 }
637
638 let loglevel = simplelog::LevelFilter::Info;
639
640 let mut lcfg = simplelog::ConfigBuilder::new();
641 lcfg.set_time_offset_to_local()
642 .expect("Failed to set local time offset")
643 .set_time_level(simplelog::LevelFilter::Error)
644 .set_location_level(simplelog::LevelFilter::Off)
645 .set_target_level(simplelog::LevelFilter::Off)
646 .set_thread_level(simplelog::LevelFilter::Off);
647 simplelog::TermLogger::init(
648 loglevel,
649 lcfg.build(),
650 simplelog::TerminalMode::Stderr,
651 simplelog::ColorChoice::Auto,
652 )?;
653
654 let shutdown = Arc::new(AtomicBool::new(false));
655 let shutdown_clone = shutdown.clone();
656 ctrlc::set_handler(move || {
657 shutdown_clone.store(true, Ordering::Relaxed);
658 })
659 .context("Error setting Ctrl-C handler")?;
660
661 if let Some(intv) = opts.monitor.or(opts.stats) {
662 let shutdown_copy = shutdown.clone();
663 let jh = std::thread::spawn(move || {
664 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
665 Ok(_) => {
666 debug!("stats monitor thread finished successfully")
667 }
668 Err(error_object) => {
669 warn!(
670 "stats monitor thread finished because of an error {}",
671 error_object
672 )
673 }
674 }
675 });
676 if opts.monitor.is_some() {
677 let _ = jh.join();
678 return Ok(());
679 }
680 }
681
682 let mut open_object = MaybeUninit::uninit();
683 loop {
684 let mut sched = Scheduler::init(&opts, &mut open_object)?;
685 if !sched.run(shutdown.clone())?.should_restart() {
686 if sched.user_restart {
687 continue;
688 }
689 break;
690 }
691 }
692
693 Ok(())
694}