1mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::{c_int, c_ulong};
15use std::fmt::Write;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use anyhow::anyhow;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::warn;
31use log::{debug, info};
32use scx_stats::prelude::*;
33use scx_utils::autopower::{fetch_power_profile, PowerProfile};
34use scx_utils::build_id;
35use scx_utils::compat;
36use scx_utils::libbpf_clap_opts::LibbpfOpts;
37use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
38use scx_utils::scx_ops_attach;
39use scx_utils::scx_ops_load;
40use scx_utils::scx_ops_open;
41use scx_utils::try_set_rlimit_infinity;
42use scx_utils::uei_exited;
43use scx_utils::uei_report;
44use scx_utils::CoreType;
45use scx_utils::Cpumask;
46use scx_utils::Topology;
47use scx_utils::UserExitInfo;
48use scx_utils::NR_CPU_IDS;
49use stats::Metrics;
50
/// Name of this scheduler, printed in the version banner and in log messages.
const SCHEDULER_NAME: &str = "scx_bpfland";
52
/// CPU selection mode passed to `get_primary_cpus` when building the primary
/// scheduling domain.
#[derive(PartialEq)]
enum Powermode {
    /// Mode requested by the "turbo" primary-domain keyword.
    Turbo,
    /// Selects CPUs whose core type is `CoreType::Big`.
    Performance,
    /// Selects CPUs whose core type is `CoreType::Little`.
    Powersave,
    /// Selects every CPU, regardless of core type.
    Any,
}
60
61fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
62 let topo = Topology::new().unwrap();
63
64 let cpus: Vec<usize> = topo
65 .all_cores
66 .values()
67 .flat_map(|core| &core.cpus)
68 .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
69 (Powermode::Performance, CoreType::Big { .. }) |
71 (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
73 (Powermode::Any, ..) => Some(*cpu_id),
74 _ => None,
75 })
76 .collect();
77
78 Ok(cpus)
79}
80
/// Encode a list of CPU ids as a hexadecimal cpumask string.
///
/// Bit `n` of the mask is set when CPU `n` appears in `cpus`. The result is
/// prefixed with `0x` and printed most-significant byte first, always using
/// whole bytes (two hex digits per byte). Duplicate ids are harmless.
///
/// An empty list is encoded as the literal string `"none"`.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    let max_cpu_id = match cpus.iter().max() {
        Some(&id) => id,
        None => return String::from("none"),
    };

    // One byte per group of 8 CPUs, enough to hold the highest CPU id.
    let mut bitmask = vec![0u8; max_cpu_id / 8 + 1];
    for &cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // Emit the bytes in reverse so that CPU 0 ends up in the last hex digit.
    let mut mask = String::with_capacity(2 + 2 * bitmask.len());
    mask.push_str("0x");
    for byte in bitmask.iter().rev() {
        // Writing to a `String` cannot fail.
        let _ = write!(mask, "{:02x}", byte);
    }

    mask
}
108
// Command-line options of the scheduler, parsed with clap.
//
// NOTE: plain `//` comments are used on purpose here: `///` doc comments on a
// clap-derived struct become part of the generated `--help` output.
#[derive(Debug, Parser)]
struct Opts {
    // Copied into the `exit_dump_len` field of the BPF struct_ops.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    // Time slice in microseconds; converted to nanoseconds and stored in
    // the BPF `slice_max` constant.
    #[clap(short = 's', long, default_value = "1000")]
    slice_us: u64,

    // Slice lag in microseconds; converted to nanoseconds and stored in the
    // BPF `slice_lag` constant.
    #[clap(short = 'l', long, default_value = "40000")]
    slice_us_lag: u64,

    // Throttle time in microseconds; converted to nanoseconds and stored in
    // the BPF `throttle_ns` constant. Any value > 0 also forces
    // `local_kthreads` on.
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    // Idle resume latency in microseconds applied to every CPU at startup.
    // Negative values (the default) leave the system setting untouched.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    // Forwarded to the BPF `local_pcpu` constant.
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    // Forwarded to the BPF `local_kthreads` constant (also implied by
    // `throttle_us > 0`).
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    // Forwarded to the BPF `no_wake_sync` constant.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    // Forwarded to the BPF `sticky_tasks` constant; also enables the
    // `do_nanosleep` and `ksys_read` kprobes when available.
    #[clap(short = 'S', long, action = clap::ArgAction::SetTrue)]
    sticky_tasks: bool,

    // Primary scheduling domain: one of the keywords "powersave",
    // "performance", "turbo", "auto", "all", or anything else is parsed as
    // a cpumask string.
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    // Forwarded to the BPF `preferred_idle_scan` constant; when set, the
    // capacity-ordered CPU list is also logged at startup.
    #[clap(short = 'P', long, action = clap::ArgAction::SetTrue)]
    preferred_idle_scan: bool,

    // Disable SMT awareness even if the topology reports SMT as enabled.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    // Disable NUMA optimizations even if more than one node has CPUs.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    // Select the "auto" cpufreq performance level (unless the primary
    // domain is "powersave").
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    // Interval in seconds for the stats monitor thread, run alongside the
    // scheduler.
    #[clap(long)]
    stats: Option<f64>,

    // Interval in seconds for the stats monitor; when set, only the monitor
    // runs and the scheduler is not started.
    #[clap(long)]
    monitor: Option<f64>,

    // Forwarded to the BPF `debug` constant.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    // Enables debug output on the BPF object builder.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    // Print the scheduler name and version, then exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    // Print the description of the exported statistics, then exit.
    #[clap(long)]
    help_stats: bool,

    // Extra libbpf options, converted into the BPF open options.
    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,
}
249
/// Userspace side of the scheduler: owns the loaded BPF skeleton and the
/// state needed to run it and tear it down.
struct Scheduler<'a> {
    /// Loaded BPF skeleton.
    skel: BpfSkel<'a>,
    /// Link of the attached struct_ops; taken (and thus dropped) when `run` ends.
    struct_ops: Option<libbpf_rs::Link>,
    /// Command-line options the scheduler was started with.
    opts: &'a Opts,
    /// CPU topology captured at init time; used on drop to restore the
    /// per-CPU idle resume latency.
    topo: Topology,
    /// Last power profile observed, used to detect profile changes.
    power_profile: PowerProfile,
    /// Server answering statistics requests with `Metrics`.
    stats_server: StatsServer<(), Metrics>,
    /// Set when `run` stopped because the scheduler must be re-initialized.
    user_restart: bool,
}
259
260impl<'a> Scheduler<'a> {
261 fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
262 try_set_rlimit_infinity();
263
264 let topo = Topology::new().unwrap();
266
267 let smt_enabled = !opts.disable_smt && topo.smt_enabled;
269
270 let nr_nodes = topo
272 .nodes
273 .values()
274 .filter(|node| !node.all_cpus.is_empty())
275 .count();
276 info!("NUMA nodes: {}", nr_nodes);
277
278 let numa_enabled = !opts.disable_numa && nr_nodes > 1;
280 if !numa_enabled {
281 info!("Disabling NUMA optimizations");
282 }
283
284 let power_profile = Self::power_profile();
286 let domain =
287 Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
288 anyhow!(
289 "failed to resolve primary domain '{}': {}",
290 &opts.primary_domain,
291 err
292 )
293 })?;
294
295 info!(
296 "{} {} {}",
297 SCHEDULER_NAME,
298 build_id::full_version(env!("CARGO_PKG_VERSION")),
299 if smt_enabled { "SMT on" } else { "SMT off" }
300 );
301
302 info!(
304 "scheduler options: {}",
305 std::env::args().collect::<Vec<_>>().join(" ")
306 );
307
308 if opts.idle_resume_us >= 0 {
309 if !cpu_idle_resume_latency_supported() {
310 warn!("idle resume latency not supported");
311 } else {
312 info!("Setting idle QoS to {} us", opts.idle_resume_us);
313 for cpu in topo.all_cpus.values() {
314 update_cpu_idle_resume_latency(
315 cpu.id,
316 opts.idle_resume_us.try_into().unwrap(),
317 )?;
318 }
319 }
320 }
321
322 let mut skel_builder = BpfSkelBuilder::default();
324 skel_builder.obj_builder.debug(opts.verbose);
325 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
326 let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops, open_opts)?;
327
328 skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;
329
330 let rodata = skel.maps.rodata_data.as_mut().unwrap();
332 rodata.debug = opts.debug;
333 rodata.smt_enabled = smt_enabled;
334 rodata.numa_enabled = numa_enabled;
335 rodata.local_pcpu = opts.local_pcpu;
336 rodata.no_wake_sync = opts.no_wake_sync;
337 rodata.sticky_tasks = opts.sticky_tasks;
338 rodata.slice_max = opts.slice_us * 1000;
339 rodata.slice_lag = opts.slice_us_lag * 1000;
340 rodata.throttle_ns = opts.throttle_us * 1000;
341 rodata.primary_all = domain.weight() == *NR_CPU_IDS;
342
343 let mut cpus: Vec<_> = topo.all_cpus.values().collect();
345 cpus.sort_by_key(|cpu| std::cmp::Reverse(cpu.cpu_capacity));
346 for (i, cpu) in cpus.iter().enumerate() {
347 rodata.cpu_capacity[cpu.id] = cpu.cpu_capacity as c_ulong;
348 rodata.preferred_cpus[i] = cpu.id as u64;
349 }
350 if opts.preferred_idle_scan {
351 info!(
352 "Preferred CPUs: {:?}",
353 &rodata.preferred_cpus[0..cpus.len()]
354 );
355 }
356 rodata.preferred_idle_scan = opts.preferred_idle_scan;
357
358 rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;
361
362 if opts.sticky_tasks {
364 if let Err(err) =
365 compat::cond_kprobe_enable("do_nanosleep", &skel.progs.kprobe_do_nanosleep)
366 {
367 warn!("failed to enable kprobe/do_nanosleep{}", err);
368 }
369 if let Err(err) = compat::cond_kprobe_enable("ksys_read", &skel.progs.kprobe_ksys_read)
370 {
371 warn!("failed to enable kprobe/ksys_read{}", err);
372 }
373 }
374
375 skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
377 | *compat::SCX_OPS_ENQ_LAST
378 | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
379 | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
380 | if numa_enabled {
381 *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
382 } else {
383 0
384 };
385 info!(
386 "scheduler flags: {:#x}",
387 skel.struct_ops.bpfland_ops_mut().flags
388 );
389
390 let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
392
393 Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
395 anyhow!(
396 "failed to initialize primary domain 0x{:x}: {}",
397 domain,
398 err
399 )
400 })?;
401
402 if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
404 bail!(
405 "failed to initialize cpufreq performance level: error {}",
406 err
407 );
408 }
409
410 if smt_enabled {
412 Self::init_smt_domains(&mut skel, &topo)?;
413 }
414
415 let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
417 let stats_server = StatsServer::new(stats::server_data()).launch()?;
418
419 Ok(Self {
420 skel,
421 struct_ops,
422 opts,
423 topo,
424 power_profile,
425 stats_server,
426 user_restart: false,
427 })
428 }
429
430 fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
431 let prog = &mut skel.progs.enable_primary_cpu;
432 let mut args = cpu_arg {
433 cpu_id: cpu as c_int,
434 };
435 let input = ProgramInput {
436 context_in: Some(unsafe {
437 std::slice::from_raw_parts_mut(
438 &mut args as *mut _ as *mut u8,
439 std::mem::size_of_val(&args),
440 )
441 }),
442 ..Default::default()
443 };
444 let out = prog.test_run(input).unwrap();
445 if out.return_value != 0 {
446 return Err(out.return_value);
447 }
448
449 Ok(())
450 }
451
452 fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
453 let mut cpus = get_primary_cpus(profile).unwrap_or_default();
454 if cpus.is_empty() {
455 cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
456 }
457 Cpumask::from_str(&cpus_to_cpumask(&cpus))
458 }
459
460 fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
461 let domain = match primary_domain {
462 "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
463 "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
464 "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
465 "auto" => match power_profile {
466 PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
467 PowerProfile::Balanced { .. }
468 | PowerProfile::Performance
469 | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
470 },
471 "all" => Self::epp_to_cpumask(Powermode::Any)?,
472 &_ => Cpumask::from_str(primary_domain)?,
473 };
474
475 Ok(domain)
476 }
477
478 fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
479 info!("primary CPU domain = 0x{:x}", domain);
480
481 if let Err(err) = Self::enable_primary_cpu(skel, -1) {
483 bail!("failed to reset primary domain: error {}", err);
484 }
485
486 for cpu in 0..*NR_CPU_IDS {
488 if domain.test_cpu(cpu) {
489 if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
490 bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
491 }
492 }
493 }
494
495 Ok(())
496 }
497
498 fn init_cpufreq_perf(
500 skel: &mut BpfSkel<'_>,
501 primary_domain: &String,
502 auto: bool,
503 ) -> Result<()> {
504 let perf_lvl: i64 = match primary_domain.as_str() {
507 "powersave" => 0,
508 _ if auto => -1,
509 _ => 1024,
510 };
511 info!(
512 "cpufreq performance level: {}",
513 match perf_lvl {
514 1024 => "max".into(),
515 0 => "min".into(),
516 n if n < 0 => "auto".into(),
517 _ => perf_lvl.to_string(),
518 }
519 );
520 skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;
521
522 Ok(())
523 }
524
525 fn power_profile() -> PowerProfile {
526 let profile = fetch_power_profile(true);
527 if profile == PowerProfile::Unknown {
528 fetch_power_profile(false)
529 } else {
530 profile
531 }
532 }
533
534 fn refresh_sched_domain(&mut self) -> bool {
535 if self.power_profile != PowerProfile::Unknown {
536 let power_profile = Self::power_profile();
537 if power_profile != self.power_profile {
538 self.power_profile = power_profile;
539
540 if self.opts.primary_domain == "auto" {
541 return true;
542 }
543 if let Err(err) = Self::init_cpufreq_perf(
544 &mut self.skel,
545 &self.opts.primary_domain,
546 self.opts.cpufreq,
547 ) {
548 warn!("failed to refresh cpufreq performance level: error {}", err);
549 }
550 }
551 }
552
553 false
554 }
555
556 fn enable_sibling_cpu(
557 skel: &mut BpfSkel<'_>,
558 cpu: usize,
559 sibling_cpu: usize,
560 ) -> Result<(), u32> {
561 let prog = &mut skel.progs.enable_sibling_cpu;
562 let mut args = domain_arg {
563 cpu_id: cpu as c_int,
564 sibling_cpu_id: sibling_cpu as c_int,
565 };
566 let input = ProgramInput {
567 context_in: Some(unsafe {
568 std::slice::from_raw_parts_mut(
569 &mut args as *mut _ as *mut u8,
570 std::mem::size_of_val(&args),
571 )
572 }),
573 ..Default::default()
574 };
575 let out = prog.test_run(input).unwrap();
576 if out.return_value != 0 {
577 return Err(out.return_value);
578 }
579
580 Ok(())
581 }
582
583 fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
584 let smt_siblings = topo.sibling_cpus();
585
586 info!("SMT sibling CPUs: {:?}", smt_siblings);
587 for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
588 Self::enable_sibling_cpu(skel, cpu, *sibling_cpu as usize).unwrap();
589 }
590
591 Ok(())
592 }
593
594 fn get_metrics(&self) -> Metrics {
595 let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
596 Metrics {
597 nr_running: bss_data.nr_running,
598 nr_cpus: bss_data.nr_online_cpus,
599 nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
600 nr_direct_dispatches: bss_data.nr_direct_dispatches,
601 nr_shared_dispatches: bss_data.nr_shared_dispatches,
602 }
603 }
604
    /// Report whether the BPF scheduler has exited, as seen through the
    /// skeleton's `uei` (user exit info) via `uei_exited!`.
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
608
609 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
610 let (res_ch, req_ch) = self.stats_server.channels();
611 while !shutdown.load(Ordering::Relaxed) && !self.exited() {
612 if self.refresh_sched_domain() {
613 self.user_restart = true;
614 break;
615 }
616 match req_ch.recv_timeout(Duration::from_secs(1)) {
617 Ok(()) => res_ch.send(self.get_metrics())?,
618 Err(RecvTimeoutError::Timeout) => {}
619 Err(e) => Err(e)?,
620 }
621 }
622
623 let _ = self.struct_ops.take();
624 uei_report!(&self.skel, uei)
625 }
626}
627
628impl Drop for Scheduler<'_> {
629 fn drop(&mut self) {
630 info!("Unregister {SCHEDULER_NAME} scheduler");
631
632 if self.opts.idle_resume_us >= 0 {
634 if cpu_idle_resume_latency_supported() {
635 for cpu in self.topo.all_cpus.values() {
636 update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
637 .unwrap();
638 }
639 }
640 }
641 }
642}
643
644fn main() -> Result<()> {
645 let opts = Opts::parse();
646
647 if opts.version {
648 println!(
649 "{} {}",
650 SCHEDULER_NAME,
651 build_id::full_version(env!("CARGO_PKG_VERSION"))
652 );
653 return Ok(());
654 }
655
656 if opts.help_stats {
657 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
658 return Ok(());
659 }
660
661 let loglevel = simplelog::LevelFilter::Info;
662
663 let mut lcfg = simplelog::ConfigBuilder::new();
664 lcfg.set_time_offset_to_local()
665 .expect("Failed to set local time offset")
666 .set_time_level(simplelog::LevelFilter::Error)
667 .set_location_level(simplelog::LevelFilter::Off)
668 .set_target_level(simplelog::LevelFilter::Off)
669 .set_thread_level(simplelog::LevelFilter::Off);
670 simplelog::TermLogger::init(
671 loglevel,
672 lcfg.build(),
673 simplelog::TerminalMode::Stderr,
674 simplelog::ColorChoice::Auto,
675 )?;
676
677 let shutdown = Arc::new(AtomicBool::new(false));
678 let shutdown_clone = shutdown.clone();
679 ctrlc::set_handler(move || {
680 shutdown_clone.store(true, Ordering::Relaxed);
681 })
682 .context("Error setting Ctrl-C handler")?;
683
684 if let Some(intv) = opts.monitor.or(opts.stats) {
685 let shutdown_copy = shutdown.clone();
686 let jh = std::thread::spawn(move || {
687 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
688 Ok(_) => {
689 debug!("stats monitor thread finished successfully")
690 }
691 Err(error_object) => {
692 warn!(
693 "stats monitor thread finished because of an error {}",
694 error_object
695 )
696 }
697 }
698 });
699 if opts.monitor.is_some() {
700 let _ = jh.join();
701 return Ok(());
702 }
703 }
704
705 let mut open_object = MaybeUninit::uninit();
706 loop {
707 let mut sched = Scheduler::init(&opts, &mut open_object)?;
708 if !sched.run(shutdown.clone())?.should_restart() {
709 if sched.user_restart {
710 continue;
711 }
712 break;
713 }
714 }
715
716 Ok(())
717}