1mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::{c_int, c_ulong};
15use std::fmt::Write;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use anyhow::anyhow;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::warn;
31use log::{debug, info};
32use scx_stats::prelude::*;
33use scx_utils::autopower::{fetch_power_profile, PowerProfile};
34use scx_utils::build_id;
35use scx_utils::compat;
36use scx_utils::get_primary_cpus;
37use scx_utils::libbpf_clap_opts::LibbpfOpts;
38use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
39use scx_utils::scx_ops_attach;
40use scx_utils::scx_ops_load;
41use scx_utils::scx_ops_open;
42use scx_utils::try_set_rlimit_infinity;
43use scx_utils::uei_exited;
44use scx_utils::uei_report;
45use scx_utils::Cpumask;
46use scx_utils::Powermode;
47use scx_utils::Topology;
48use scx_utils::UserExitInfo;
49use scx_utils::NR_CPU_IDS;
50use stats::Metrics;
51
/// Scheduler name printed in the version banner and in log messages.
const SCHEDULER_NAME: &str = "scx_bpfland";
53
/// Render a list of CPU ids as a hexadecimal bitmask string.
///
/// Bit `n` of the mask is set when CPU `n` appears in `cpus` (duplicates are
/// harmless). The mask is sized from the highest CPU id, rounded up to whole
/// bytes, and printed most-significant byte first with a `0x` prefix, so
/// `[0, 1, 8]` becomes `"0x0103"`. An empty list yields the literal string
/// `"none"`.
///
/// The parameter is a slice, so callers holding a `Vec<usize>` can keep
/// passing `&vec`.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    let max_cpu_id = match cpus.iter().max() {
        Some(&id) => id,
        None => return String::from("none"),
    };

    // One bit per CPU id up to and including `max_cpu_id`, in whole bytes.
    let mut bitmask = vec![0u8; max_cpu_id / 8 + 1];
    for &cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // "0x" plus two hex digits per byte.
    let mut hex = String::with_capacity(2 + bitmask.len() * 2);
    hex.push_str("0x");
    for byte in bitmask.iter().rev() {
        // Formatting into a `String` cannot fail.
        let _ = write!(hex, "{:02x}", byte);
    }

    hex
}
81
82#[derive(Debug, Parser)]
90struct Opts {
91 #[clap(long, default_value = "0")]
93 exit_dump_len: u32,
94
95 #[clap(short = 's', long, default_value = "1000")]
97 slice_us: u64,
98
99 #[clap(short = 'L', long, default_value = "0")]
101 slice_min_us: u64,
102
103 #[clap(short = 'l', long, default_value = "40000")]
108 slice_us_lag: u64,
109
110 #[clap(short = 't', long, default_value = "0")]
115 throttle_us: u64,
116
117 #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
123 idle_resume_us: i64,
124
125 #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
132 local_pcpu: bool,
133
134 #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
143 local_kthreads: bool,
144
145 #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
152 no_wake_sync: bool,
153
154 #[clap(short = 'S', long, action = clap::ArgAction::SetTrue)]
162 sticky_tasks: bool,
163
164 #[clap(short = 'm', long, default_value = "auto")]
175 primary_domain: String,
176
177 #[clap(short = 'P', long, action = clap::ArgAction::SetTrue)]
182 preferred_idle_scan: bool,
183
184 #[clap(long, action = clap::ArgAction::SetTrue)]
186 disable_smt: bool,
187
188 #[clap(long, action = clap::ArgAction::SetTrue)]
190 disable_numa: bool,
191
192 #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
196 cpufreq: bool,
197
198 #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
200 timely: bool,
201
202 #[clap(long, default_value = "5000")]
204 timely_tlow_us: u64,
205
206 #[clap(long, default_value = "50000")]
208 timely_thigh_us: u64,
209
210 #[clap(long, default_value = "128")]
212 timely_gain_min: u32,
213
214 #[clap(long, default_value = "32")]
216 timely_gain_step: u32,
217
218 #[clap(long, default_value = "768")]
220 timely_hai_thresh: u32,
221
222 #[clap(long, default_value = "2")]
224 timely_hai_multiplier: u32,
225
226 #[clap(long, default_value = "768")]
228 timely_backoff_low: u32,
229
230 #[clap(long, default_value = "960")]
232 timely_backoff_high: u32,
233
234 #[clap(long, default_value = "992")]
236 timely_backoff_gradient: u32,
237
238 #[clap(long, default_value = "125")]
240 timely_gradient_margin_us: u64,
241
242 #[clap(long, default_value = "500")]
244 timely_control_interval_us: u64,
245
246 #[clap(long)]
248 stats: Option<f64>,
249
250 #[clap(long)]
253 monitor: Option<f64>,
254
255 #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
257 debug: bool,
258
259 #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
261 verbose: bool,
262
263 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
265 version: bool,
266
267 #[clap(long)]
269 help_stats: bool,
270
271 #[clap(flatten, next_help_heading = "Libbpf Options")]
272 pub libbpf: LibbpfOpts,
273}
274
/// User-space side of the scheduler: owns the BPF skeleton, its attachment and
/// the state needed to serve statistics and react to power-profile changes.
struct Scheduler<'a> {
    /// BPF skeleton, as returned by the load step in `init`.
    skel: BpfSkel<'a>,
    /// Link produced when the struct_ops are attached; `run` takes it out
    /// (dropping it) before the exit information is reported.
    struct_ops: Option<libbpf_rs::Link>,
    /// Parsed command-line options.
    opts: &'a Opts,
    /// Host topology captured in `init`.
    topo: Topology,
    /// Power profile seen most recently; `refresh_sched_domain` compares the
    /// current profile against it.
    power_profile: PowerProfile,
    /// Statistics server; `run` answers its requests with `Metrics`.
    stats_server: StatsServer<(), Metrics>,
    /// Set by `run` when a power-profile change asks for a restart; read by
    /// `main` to decide whether to re-initialize the scheduler.
    user_restart: bool,
}
284
285impl<'a> Scheduler<'a> {
286 fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
287 try_set_rlimit_infinity();
288
289 let topo = Topology::new().unwrap();
291
292 let smt_enabled = !opts.disable_smt && topo.smt_enabled;
294
295 let nr_nodes = topo
297 .nodes
298 .values()
299 .filter(|node| !node.all_cpus.is_empty())
300 .count();
301 info!("NUMA nodes: {}", nr_nodes);
302
303 let numa_enabled = !opts.disable_numa && nr_nodes > 1;
305 if !numa_enabled {
306 info!("Disabling NUMA optimizations");
307 }
308
309 let power_profile = Self::power_profile();
311 let domain =
312 Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
313 anyhow!(
314 "failed to resolve primary domain '{}': {}",
315 &opts.primary_domain,
316 err
317 )
318 })?;
319
320 info!(
321 "{} {} {}",
322 SCHEDULER_NAME,
323 build_id::full_version(env!("CARGO_PKG_VERSION")),
324 if smt_enabled { "SMT on" } else { "SMT off" }
325 );
326
327 info!(
329 "scheduler options: {}",
330 std::env::args().collect::<Vec<_>>().join(" ")
331 );
332
333 if opts.idle_resume_us >= 0 {
334 if !cpu_idle_resume_latency_supported() {
335 warn!("idle resume latency not supported");
336 } else {
337 info!("Setting idle QoS to {} us", opts.idle_resume_us);
338 for cpu in topo.all_cpus.values() {
339 update_cpu_idle_resume_latency(
340 cpu.id,
341 opts.idle_resume_us.try_into().unwrap(),
342 )?;
343 }
344 }
345 }
346
347 let mut skel_builder = BpfSkelBuilder::default();
349 skel_builder.obj_builder.debug(opts.verbose);
350 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
351 let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops, open_opts)?;
352
353 skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;
354
355 let rodata = skel.maps.rodata_data.as_mut().unwrap();
357 rodata.debug = opts.debug;
358 rodata.smt_enabled = smt_enabled;
359 rodata.numa_enabled = numa_enabled;
360 rodata.local_pcpu = opts.local_pcpu;
361 rodata.no_wake_sync = opts.no_wake_sync;
362 rodata.sticky_tasks = opts.sticky_tasks;
363 rodata.slice_max = opts.slice_us * 1000;
364 rodata.slice_min = opts.slice_min_us * 1000;
365 rodata.slice_lag = opts.slice_us_lag * 1000;
366 rodata.throttle_ns = opts.throttle_us * 1000;
367 rodata.primary_all = domain.weight() == *NR_CPU_IDS;
368
369 rodata.timely_enabled = opts.timely;
371 rodata.timely_tlow_ns = opts.timely_tlow_us * 1000;
372 rodata.timely_thigh_ns = opts.timely_thigh_us * 1000;
373 rodata.timely_gain_min_fp = opts.timely_gain_min;
374 rodata.timely_gain_max_fp = 1024;
375 rodata.timely_gain_step_fp = opts.timely_gain_step;
376 rodata.timely_hai_thresh_fp = opts.timely_hai_thresh;
377 rodata.timely_hai_multiplier = opts.timely_hai_multiplier;
378 rodata.timely_backoff_low_fp = opts.timely_backoff_low;
379 rodata.timely_backoff_high_fp = opts.timely_backoff_high;
380 rodata.timely_backoff_gradient_fp = opts.timely_backoff_gradient;
381 rodata.timely_gradient_margin_ns = opts.timely_gradient_margin_us * 1000;
382 rodata.timely_control_interval_ns = opts.timely_control_interval_us * 1000;
383
384 let mut cpus: Vec<_> = topo.all_cpus.values().collect();
386 cpus.sort_by_key(|cpu| std::cmp::Reverse(cpu.cpu_capacity));
387 for (i, cpu) in cpus.iter().enumerate() {
388 rodata.cpu_capacity[cpu.id] = cpu.cpu_capacity as c_ulong;
389 rodata.preferred_cpus[i] = cpu.id as u64;
390 }
391 if opts.preferred_idle_scan {
392 info!(
393 "Preferred CPUs: {:?}",
394 &rodata.preferred_cpus[0..cpus.len()]
395 );
396 }
397 rodata.preferred_idle_scan = opts.preferred_idle_scan;
398
399 rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;
402
403 skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
405 | *compat::SCX_OPS_ENQ_LAST
406 | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
407 | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
408 | if numa_enabled {
409 *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
410 } else {
411 0
412 };
413 info!(
414 "scheduler flags: {:#x}",
415 skel.struct_ops.bpfland_ops_mut().flags
416 );
417
418 let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
420
421 Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
423 anyhow!(
424 "failed to initialize primary domain 0x{:x}: {}",
425 domain,
426 err
427 )
428 })?;
429
430 if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
432 bail!(
433 "failed to initialize cpufreq performance level: error {}",
434 err
435 );
436 }
437
438 if smt_enabled {
440 Self::init_smt_domains(&mut skel, &topo)?;
441 }
442
443 let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
445 let stats_server = StatsServer::new(stats::server_data()).launch()?;
446
447 Ok(Self {
448 skel,
449 struct_ops,
450 opts,
451 topo,
452 power_profile,
453 stats_server,
454 user_restart: false,
455 })
456 }
457
458 fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
459 let prog = &mut skel.progs.enable_primary_cpu;
460 let mut args = cpu_arg {
461 cpu_id: cpu as c_int,
462 };
463 let input = ProgramInput {
464 context_in: Some(unsafe {
465 std::slice::from_raw_parts_mut(
466 &mut args as *mut _ as *mut u8,
467 std::mem::size_of_val(&args),
468 )
469 }),
470 ..Default::default()
471 };
472 let out = prog.test_run(input).unwrap();
473 if out.return_value != 0 {
474 return Err(out.return_value);
475 }
476
477 Ok(())
478 }
479
480 fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
481 let mut cpus = get_primary_cpus(profile).unwrap_or_default();
482 if cpus.is_empty() {
483 cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
484 }
485 Cpumask::from_str(&cpus_to_cpumask(&cpus))
486 }
487
488 fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
489 let domain = match primary_domain {
490 "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
491 "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
492 "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
493 "auto" => match power_profile {
494 PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
495 PowerProfile::Balanced { .. }
496 | PowerProfile::Performance
497 | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
498 },
499 "all" => Self::epp_to_cpumask(Powermode::Any)?,
500 &_ => Cpumask::from_str(primary_domain)?,
501 };
502
503 Ok(domain)
504 }
505
506 fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
507 info!("primary CPU domain = 0x{:x}", domain);
508
509 if let Err(err) = Self::enable_primary_cpu(skel, -1) {
511 bail!("failed to reset primary domain: error {}", err);
512 }
513
514 for cpu in 0..*NR_CPU_IDS {
516 if domain.test_cpu(cpu) {
517 if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
518 bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
519 }
520 }
521 }
522
523 Ok(())
524 }
525
526 fn init_cpufreq_perf(
528 skel: &mut BpfSkel<'_>,
529 primary_domain: &String,
530 auto: bool,
531 ) -> Result<()> {
532 let perf_lvl: i64 = match primary_domain.as_str() {
535 "powersave" => 0,
536 _ if auto => -1,
537 _ => 1024,
538 };
539 info!(
540 "cpufreq performance level: {}",
541 match perf_lvl {
542 1024 => "max".into(),
543 0 => "min".into(),
544 n if n < 0 => "auto".into(),
545 _ => perf_lvl.to_string(),
546 }
547 );
548 skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;
549
550 Ok(())
551 }
552
553 fn power_profile() -> PowerProfile {
554 let profile = fetch_power_profile(true);
555 if profile == PowerProfile::Unknown {
556 fetch_power_profile(false)
557 } else {
558 profile
559 }
560 }
561
562 fn refresh_sched_domain(&mut self) -> bool {
563 if self.power_profile != PowerProfile::Unknown {
564 let power_profile = Self::power_profile();
565 if power_profile != self.power_profile {
566 self.power_profile = power_profile;
567
568 if self.opts.primary_domain == "auto" {
569 return true;
570 }
571 if let Err(err) = Self::init_cpufreq_perf(
572 &mut self.skel,
573 &self.opts.primary_domain,
574 self.opts.cpufreq,
575 ) {
576 warn!("failed to refresh cpufreq performance level: error {}", err);
577 }
578 }
579 }
580
581 false
582 }
583
584 fn enable_sibling_cpu(
585 skel: &mut BpfSkel<'_>,
586 cpu: usize,
587 sibling_cpu: usize,
588 ) -> Result<(), u32> {
589 let prog = &mut skel.progs.enable_sibling_cpu;
590 let mut args = domain_arg {
591 cpu_id: cpu as c_int,
592 sibling_cpu_id: sibling_cpu as c_int,
593 };
594 let input = ProgramInput {
595 context_in: Some(unsafe {
596 std::slice::from_raw_parts_mut(
597 &mut args as *mut _ as *mut u8,
598 std::mem::size_of_val(&args),
599 )
600 }),
601 ..Default::default()
602 };
603 let out = prog.test_run(input).unwrap();
604 if out.return_value != 0 {
605 return Err(out.return_value);
606 }
607
608 Ok(())
609 }
610
611 fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
612 let smt_siblings = topo.sibling_cpus();
613
614 info!("SMT sibling CPUs: {:?}", smt_siblings);
615 for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
616 Self::enable_sibling_cpu(skel, cpu, *sibling_cpu as usize).unwrap();
617 }
618
619 Ok(())
620 }
621
622 fn get_metrics(&self) -> Metrics {
623 let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
624 Metrics {
625 nr_running: bss_data.nr_running,
626 nr_cpus: bss_data.nr_online_cpus,
627 nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
628 nr_direct_dispatches: bss_data.nr_direct_dispatches,
629 nr_shared_dispatches: bss_data.nr_shared_dispatches,
630 nr_delay_recovery_dispatches: bss_data.nr_delay_recovery_dispatches,
631 nr_delay_middle_add_dispatches: bss_data.nr_delay_middle_add_dispatches,
632 nr_delay_fast_recovery_dispatches: bss_data.nr_delay_fast_recovery_dispatches,
633 nr_delay_rate_limited_dispatches: bss_data.nr_delay_rate_limited_dispatches,
634 nr_gain_floor_dispatches: bss_data.nr_gain_floor_dispatches,
635 nr_gain_ceiling_dispatches: bss_data.nr_gain_ceiling_dispatches,
636 nr_delay_low_region_samples: bss_data.nr_delay_low_region_samples,
637 nr_delay_mid_region_samples: bss_data.nr_delay_mid_region_samples,
638 nr_delay_high_region_samples: bss_data.nr_delay_high_region_samples,
639 nr_gain_floor_resident_samples: bss_data.nr_gain_floor_resident_samples,
640 nr_gain_mid_resident_samples: bss_data.nr_gain_mid_resident_samples,
641 nr_gain_ceiling_resident_samples: bss_data.nr_gain_ceiling_resident_samples,
642 nr_idle_select_path_picks: bss_data.nr_idle_select_path_picks,
643 nr_idle_enqueue_path_picks: bss_data.nr_idle_enqueue_path_picks,
644 nr_idle_prev_cpu_picks: bss_data.nr_idle_prev_cpu_picks,
645 nr_idle_primary_picks: bss_data.nr_idle_primary_picks,
646 nr_idle_spill_picks: bss_data.nr_idle_spill_picks,
647 nr_idle_pick_failures: bss_data.nr_idle_pick_failures,
648 nr_idle_primary_domain_misses: bss_data.nr_idle_primary_domain_misses,
649 nr_idle_global_misses: bss_data.nr_idle_global_misses,
650 nr_waker_cpu_biases: bss_data.nr_waker_cpu_biases,
651 nr_keep_running_reuses: bss_data.nr_keep_running_reuses,
652 nr_keep_running_queue_empty: bss_data.nr_keep_running_queue_empty,
653 nr_keep_running_smt_blocked: bss_data.nr_keep_running_smt_blocked,
654 nr_keep_running_queued_work: bss_data.nr_keep_running_queued_work,
655 nr_dispatch_cpu_dsq_consumes: bss_data.nr_dispatch_cpu_dsq_consumes,
656 nr_dispatch_node_dsq_consumes: bss_data.nr_dispatch_node_dsq_consumes,
657 nr_cpu_release_reenqueue: bss_data.nr_cpu_release_reenqueue,
658 }
659 }
660
/// Report whether the BPF scheduler has exited, as determined by the
/// `uei_exited!` macro applied to the skeleton's `uei` record.
pub fn exited(&mut self) -> bool {
    uei_exited!(&self.skel, uei)
}
664
665 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
666 let (res_ch, req_ch) = self.stats_server.channels();
667 while !shutdown.load(Ordering::Relaxed) && !self.exited() {
668 if self.refresh_sched_domain() {
669 self.user_restart = true;
670 break;
671 }
672 match req_ch.recv_timeout(Duration::from_secs(1)) {
673 Ok(()) => res_ch.send(self.get_metrics())?,
674 Err(RecvTimeoutError::Timeout) => {}
675 Err(e) => Err(e)?,
676 }
677 }
678
679 let _ = self.struct_ops.take();
680 uei_report!(&self.skel, uei)
681 }
682}
683
684impl Drop for Scheduler<'_> {
685 fn drop(&mut self) {
686 info!("Unregister {SCHEDULER_NAME} scheduler");
687
688 if self.opts.idle_resume_us >= 0 {
690 if cpu_idle_resume_latency_supported() {
691 for cpu in self.topo.all_cpus.values() {
692 update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
693 .unwrap();
694 }
695 }
696 }
697 }
698}
699
700fn main() -> Result<()> {
701 let opts = Opts::parse();
702
703 if opts.version {
704 println!(
705 "{} {}",
706 SCHEDULER_NAME,
707 build_id::full_version(env!("CARGO_PKG_VERSION"))
708 );
709 return Ok(());
710 }
711
712 if opts.help_stats {
713 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
714 return Ok(());
715 }
716
717 let loglevel = simplelog::LevelFilter::Info;
718
719 let mut lcfg = simplelog::ConfigBuilder::new();
720 lcfg.set_time_offset_to_local()
721 .expect("Failed to set local time offset")
722 .set_time_level(simplelog::LevelFilter::Error)
723 .set_location_level(simplelog::LevelFilter::Off)
724 .set_target_level(simplelog::LevelFilter::Off)
725 .set_thread_level(simplelog::LevelFilter::Off);
726 simplelog::TermLogger::init(
727 loglevel,
728 lcfg.build(),
729 simplelog::TerminalMode::Stderr,
730 simplelog::ColorChoice::Auto,
731 )?;
732
733 let shutdown = Arc::new(AtomicBool::new(false));
734 let shutdown_clone = shutdown.clone();
735 ctrlc::set_handler(move || {
736 shutdown_clone.store(true, Ordering::Relaxed);
737 })
738 .context("Error setting Ctrl-C handler")?;
739
740 if let Some(intv) = opts.monitor.or(opts.stats) {
741 let shutdown_copy = shutdown.clone();
742 let jh = std::thread::spawn(move || {
743 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
744 Ok(_) => {
745 debug!("stats monitor thread finished successfully")
746 }
747 Err(error_object) => {
748 warn!(
749 "stats monitor thread finished because of an error {}",
750 error_object
751 )
752 }
753 }
754 });
755 if opts.monitor.is_some() {
756 let _ = jh.join();
757 return Ok(());
758 }
759 }
760
761 let mut open_object = MaybeUninit::uninit();
762 loop {
763 let mut sched = Scheduler::init(&opts, &mut open_object)?;
764 if !sched.run(shutdown.clone())?.should_restart() {
765 if sched.user_restart {
766 continue;
767 }
768 break;
769 }
770 }
771
772 Ok(())
773}