scx_flash/
main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::mem::MaybeUninit;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::{debug, info, warn};
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_flash";

#[derive(PartialEq)]
enum Powermode {
    Turbo,
    Performance,
    Powersave,
    Any,
}

fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            // Turbo mode: prioritize CPUs with the highest max frequency
            (Powermode::Turbo, CoreType::Big { turbo: true }) |
            // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
            (Powermode::Performance, CoreType::Big { .. }) |
            // Powersave mode: add all the Little CPUs
            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}
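
// Illustrative example (hypothetical topology): on a hybrid system where CPUs
// 0-1 are Big turbo cores, CPUs 2-3 are Big non-turbo cores, and CPUs 4-7 are
// Little cores, Powermode::Turbo yields [0, 1], Powermode::Performance yields
// [0, 1, 2, 3], Powermode::Powersave yields [4, 5, 6, 7], and Powermode::Any
// yields all eight CPUs.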

// Convert an array of CPUs to the corresponding cpumask of any arbitrary size.
fn cpus_to_cpumask(cpus: &Vec<usize>) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Determine the maximum CPU ID to create a sufficiently large byte vector.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // Create a byte vector with enough bytes to cover all CPU IDs.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the appropriate bits for each CPU ID.
    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Convert the byte vector to a hexadecimal string.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}
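
// A minimal sanity check of the conversion above (the module name is
// arbitrary): CPUs {0, 1, 8} set bits 0 and 1 of byte 0 (0x03) and bit 0 of
// byte 1 (0x01), and bytes are emitted most-significant first, so the
// resulting mask string is "0x0103".
#[cfg(test)]
mod cpumask_tests {
    use super::*;

    #[test]
    fn cpus_to_cpumask_hex_encoding() {
        assert_eq!(cpus_to_cpumask(&vec![]), "none");
        assert_eq!(cpus_to_cpumask(&vec![0, 1, 8]), "0x0103");
    }
}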

#[derive(Debug, clap::Parser)]
#[command(
    name = "scx_flash",
    version,
    disable_version_flag = true,
    about = "A deadline-based scheduler focused on fairness and performance predictability.",
    long_about = r#"
scx_flash is a scheduler that focuses on ensuring fairness and performance predictability.

It operates using an earliest deadline first (EDF) policy. The deadline of each task is
defined as:

    deadline = vruntime + exec_vruntime

Here, `vruntime` represents the task's total accumulated runtime, inversely scaled by its weight,
while `exec_vruntime` accounts for the scaled runtime accumulated since the last sleep event.

Fairness is driven by `vruntime`, while `exec_vruntime` helps prioritize latency-sensitive tasks
that sleep frequently and use the CPU in short bursts.

To prevent sleeping tasks from gaining excessive priority, the maximum vruntime credit a task can
accumulate while sleeping is capped by `slice_lag`, scaled by the task’s voluntary context switch
rate (`max_avg_nvcsw`): tasks that sleep frequently can receive a larger credit, while tasks that
perform fewer, longer sleeps are granted a smaller credit. This encourages responsive behavior
without excessively boosting idle tasks.

When dynamic fairness is enabled (`--slice-lag-scaling`), the maximum vruntime sleep credit is also
scaled depending on the user-mode CPU utilization:

 - At low utilization (mostly idle system), the impact of `vruntime` is reduced, and scheduling
   decisions are driven primarily by `exec_vruntime`. This favors bursty, latency-sensitive
   workloads (e.g., hackbench), improving their performance and latency.

 - At high utilization, sleeping tasks regain their vruntime credit, increasing the influence of
   `vruntime` in deadline calculation. This restores fairness and ensures system responsiveness
   under load.

This adaptive behavior allows the scheduler to prioritize intense message-passing workloads when
the system is lightly loaded, while maintaining fairness and responsiveness when the system is
saturated or overcommitted.
"#
)]
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "4096")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "128")]
    slice_us_min: u64,

    /// Maximum runtime budget that a task can accumulate while sleeping (in microseconds).
    ///
    /// Increasing this value can help to enhance the responsiveness of interactive tasks, but it
    /// can also make performance more "spikey".
    #[clap(short = 'l', long, default_value = "4096")]
    slice_us_lag: u64,

    /// Dynamically adjust task's maximum sleep budget based on CPU utilization.
    ///
    /// Enabling this option can increase the throughput of workloads that rely heavily on message
    /// passing, but it can also reduce the overall system responsiveness.
    #[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
    slice_lag_scaling: bool,

    /// Maximum runtime penalty that a task can accumulate while running (in microseconds).
    ///
    /// Increasing this value can help to enhance the responsiveness of interactive tasks, but it
    /// can also make performance more "spikey".
    #[clap(short = 'r', long, default_value = "32768")]
    run_us_lag: u64,

    /// Maximum rate of voluntary context switches.
    ///
    /// Increasing this value can help prioritize interactive tasks with a higher sleep frequency
    /// over interactive tasks with a lower sleep frequency.
    ///
    /// Decreasing this value makes the scheduler more robust and fair.
    ///
    /// (0 = disable voluntary context switch prioritization).
    #[clap(short = 'c', long, default_value = "128")]
    max_avg_nvcsw: u64,

    /// Utilization percentage to consider a CPU as busy (-1 = auto).
    ///
    /// A value close to 0 forces tasks to migrate more quickly, increasing work conservation and
    /// potentially system responsiveness.
    ///
    /// A value close to 100 makes tasks more sticky to their CPU, benefiting cache-sensitive and
    /// server-type workloads.
    ///
    /// In auto mode (-1) the scheduler automatically tries to determine the optimal value as a
    /// function of the current workload.
    #[clap(short = 'C', long, allow_hyphen_values = true, default_value = "-1")]
    cpu_busy_thresh: i64,

    /// Throttle the running CPUs by periodically injecting idle cycles.
    ///
    /// This option can help extend battery life on portable devices, reduce heating, fan noise
    /// and overall energy consumption (0 = disable).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
    ///
    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
    /// value may reduce performance, but also improve power efficiency.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "32")]
    idle_resume_us: i64,

    /// Enable tickless mode.
    ///
    /// This option enables tickless mode: tasks get an infinite time slice and they are preempted
    /// only in case of CPU contention. This can help reduce the OS noise and provide a better
    /// level of performance isolation.
    #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
    tickless: bool,

    /// Enable round-robin scheduling.
    ///
    /// Each task is given a fixed time slice (defined by --slice-us) and run in a cyclic, fair
    /// order.
    #[clap(short = 'R', long, action = clap::ArgAction::SetTrue)]
    rr_sched: bool,

    /// Disable in-kernel idle CPU selection policy.
    ///
    /// Set this option to disable the in-kernel built-in idle CPU selection policy and rely on the
    /// custom CPU selection policy.
    #[clap(short = 'b', long, action = clap::ArgAction::SetTrue)]
    no_builtin_idle: bool,

    /// Enable per-CPU task prioritization.
    ///
    /// Enabling this option makes it possible to prioritize per-CPU tasks, which usually tend to
    /// be de-prioritized, since they can't be migrated when their only usable CPU is busy. This
    /// improves fairness, but it can also reduce the overall system throughput.
    ///
    /// This option is recommended for gaming or latency-sensitive workloads.
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Always allow direct dispatch to idle CPUs.
    ///
    /// By default tasks are not directly dispatched to idle CPUs if there are other tasks waiting
    /// in the scheduler's queues. This prevents potential starvation of the already queued tasks.
    ///
    /// Enabling this option allows tasks to always be dispatched directly to idle CPUs,
    /// potentially bypassing the scheduler queues.
    ///
    /// This can improve system throughput, especially with server workloads, but it can
    /// introduce unfairness and potentially trigger stall conditions.
    #[clap(short = 'D', long, action = clap::ArgAction::SetTrue)]
    direct_dispatch: bool,

    /// Enable CPU stickiness.
    ///
    /// Enabling this option can reduce the amount of task migrations, but it can also make
    /// performance less consistent on systems with hybrid cores.
    ///
    /// This option has no effect if the primary scheduling domain includes all the CPUs
    /// (e.g., `--primary-domain all`).
    #[clap(short = 'y', long, action = clap::ArgAction::SetTrue)]
    sticky_cpu: bool,

    /// Use native task priorities.
    ///
    /// By default, the scheduler normalizes task priorities to avoid large gaps that could lead to
    /// stalls or starvation. This option disables normalization and uses the default Linux priority
    /// range instead.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    native_priority: bool,

    /// Enable per-CPU kthread prioritization.
    ///
    /// Enabling this can improve system performance, but it may also introduce interactivity
    /// issues or unfairness in scenarios with high kthread activity, such as heavy I/O or network
    /// traffic.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    ///
    /// Enabling this option can lead to a more uniform load distribution across available cores,
    /// potentially improving performance in certain scenarios. However, it may come at the cost of
    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
    /// coupling.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
    /// tasks may overflow to other available CPUs.
    ///
    /// Special values:
    ///  - "auto" = automatically detect the CPUs based on the active power profile
    ///  - "turbo" = automatically detect and prioritize the CPUs with the highest max frequency
    ///  - "performance" = automatically detect and prioritize the fastest CPUs
    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
    ///  - "all" = all CPUs assigned to the primary domain
    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA rebalancing.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control (only with schedutil governor).
    ///
    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The scheduler
    /// is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}
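
// A minimal sketch of the deadline formula described in the long help text
// above, for illustration only: the real computation happens in the BPF side
// of the scheduler, and this function's name, the nice-0 weight of 100, and
// the plain integer scaling are assumptions, not the scheduler's exact math.
#[allow(dead_code)]
fn example_deadline(vruntime: u64, runtime_since_sleep: u64, weight: u64) -> u64 {
    // Scale the runtime accumulated since the last sleep event inversely by
    // the task's weight (100 is assumed to be the weight of a nice-0 task).
    let exec_vruntime = runtime_since_sleep * 100 / weight;

    // Earliest deadline first: lower values are scheduled sooner.
    vruntime + exec_vruntime
}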

#[derive(Debug, Clone, Copy)]
struct CpuTimes {
    user: u64,
    nice: u64,
    total: u64,
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        set_rlimit_infinity();

        // Validate command line arguments.
        assert!(opts.slice_us >= opts.slice_us_min);

        // Initialize CPU topology.
        let topo = Topology::new().unwrap();

        // Check host topology to determine if we need to enable SMT capabilities.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Print command line.
        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Determine the number of non-empty NUMA nodes in the system.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // Automatically disable NUMA optimizations when running on non-NUMA systems.
        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        // Determine the primary scheduling domain.
        let power_profile = Self::power_profile();
        let domain =
            Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
                anyhow!(
                    "failed to resolve primary domain '{}': {}",
                    &opts.primary_domain,
                    err
                )
            })?;

        // Initialize BPF connector.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let mut skel = scx_ops_open!(skel_builder, open_object, flash_ops)?;

        skel.struct_ops.flash_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override default BPF scheduling parameters.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.debug = opts.debug;
        rodata.smt_enabled = smt_enabled;
        rodata.numa_disabled = numa_disabled;
        rodata.rr_sched = opts.rr_sched;
        rodata.local_pcpu = opts.local_pcpu;
        rodata.direct_dispatch = opts.direct_dispatch;
        rodata.sticky_cpu = opts.sticky_cpu;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.tickless_sched = opts.tickless;
        rodata.native_priority = opts.native_priority;
        rodata.slice_lag_scaling = opts.slice_lag_scaling;
        rodata.builtin_idle = !opts.no_builtin_idle;
        rodata.slice_max = opts.slice_us * 1000;
        rodata.slice_min = opts.slice_us_min * 1000;
        rodata.slice_lag = opts.slice_us_lag * 1000;
        rodata.run_lag = opts.run_us_lag * 1000;
        rodata.throttle_ns = opts.throttle_us * 1000;
        rodata.max_avg_nvcsw = opts.max_avg_nvcsw;
        rodata.primary_all = domain.weight() == *NR_CPU_IDS;

        // Normalize the CPU busy threshold to the range [0 .. 1024].
        rodata.cpu_busy_thresh = if opts.cpu_busy_thresh < 0 {
            opts.cpu_busy_thresh
        } else {
            opts.cpu_busy_thresh * 1024 / 100
        };
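
        // Worked example: a threshold of 75% maps to 75 * 1024 / 100 = 768 in
        // the fixed-point scale above, while negative values are passed
        // through unchanged so the BPF side can pick the threshold itself.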

        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
        // (it's never a good idea to throttle per-CPU kthreads).
        rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        // Set scheduler compatibility flags.
        rodata.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Set scheduler flags.
        skel.struct_ops.flash_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.flash_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, flash_ops, uei)?;

        // Initialize the primary scheduling domain and the preferred domain.
        Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
            anyhow!(
                "failed to initialize primary domain 0x{:x}: {}",
                domain,
                err
            )
        })?;

        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            bail!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Initialize L2 cache domains.
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        // Initialize L3 cache domains.
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler.
        let struct_ops = Some(scx_ops_attach!(skel, flash_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
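        // The argument struct is handed to the BPF syscall program as its raw
        // context buffer; test_run() then executes the program once,
        // synchronously, and its return value is checked below.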
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

    fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false }
                | PowerProfile::Performance
                | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            &_ => Cpumask::from_str(primary_domain)?,
        };

        Ok(domain)
    }

    fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
        info!("primary CPU domain = 0x{:x}", domain);

        // Clear the primary domain by passing a negative CPU id.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            bail!("failed to reset primary domain: error {}", err);
        }

        // Update primary scheduling domain.
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

    // Update hint for the cpufreq governor.
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &String,
        auto: bool,
    ) -> Result<()> {
        // If we are using the powersave profile, always scale the CPU frequency to the minimum;
        // otherwise use the maximum, unless automatic frequency scaling is enabled.
        let perf_lvl: i64 = match primary_domain.as_str() {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        // A single CPU or an empty array is considered siblings.
        if cpus.len() <= 1 {
            return true;
        }

        // Check if each CPU is a sibling of the first CPU.
        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }
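
    // Example (hypothetical sibling map): with sibling_cpus() returning
    // [1, 0, 3, 2], the CPU sets [0], [0, 1], and [2, 3] are each SMT
    // siblings, while [0, 2] is not.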

    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Determine the list of CPU IDs associated with each cache node.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        // Update the BPF cpumasks for the cache domains.
        for (cache_id, cpus) in cache_id_map {
            // Ignore the cache domain if it includes a single CPU.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore the cache domain if all the CPUs are part of the same SMT core.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn get_metrics(&self) -> Metrics {
        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
        Metrics {
            nr_running: bss_data.nr_running,
            nr_cpus: bss_data.nr_online_cpus,
            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: bss_data.nr_direct_dispatches,
            nr_shared_dispatches: bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn compute_user_cpu_pct(prev: &CpuTimes, curr: &CpuTimes) -> Option<u64> {
        let total_diff = curr.total.saturating_sub(prev.total);
        let user_diff = (curr.user + curr.nice).saturating_sub(prev.user + prev.nice);

        if total_diff > 0 {
            let user_ratio = user_diff as f64 / total_diff as f64;
            Some((user_ratio * 1024.0).round() as u64)
        } else {
            None
        }
    }
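
    // Worked example (hypothetical tick counts): if 1024 total ticks elapsed
    // between two samples and 256 of them were user+nice ticks, the ratio is
    // 0.25 and 256 is reported, i.e. utilization is expressed in the same
    // fixed-point 0..1024 scale used for cpu_busy_thresh.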

    fn read_cpu_times() -> Option<CpuTimes> {
        let file = File::open("/proc/stat").ok()?;
        let reader = BufReader::new(file);

        for line in reader.lines() {
            let line = line.ok()?;
            if line.starts_with("cpu ") {
                let fields: Vec<&str> = line.split_whitespace().collect();
                if fields.len() < 5 {
                    return None;
                }

                let user: u64 = fields[1].parse().ok()?;
                let nice: u64 = fields[2].parse().ok()?;

                // Sum the first 8 fields as total time, including idle, system, etc.
                let total: u64 = fields
                    .iter()
                    .skip(1)
                    .take(8)
                    .filter_map(|v| v.parse::<u64>().ok())
                    .sum();

                return Some(CpuTimes { user, nice, total });
            }
        }

        None
    }

    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let mut prev_cputime = Self::read_cpu_times().expect("Failed to read initial CPU stats");
        let (res_ch, req_ch) = self.stats_server.channels();

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }

            if self.opts.cpu_busy_thresh < 0 {
                if let Some(curr_cputime) = Self::read_cpu_times() {
                    if let Some(cpu_util) = Self::compute_user_cpu_pct(&prev_cputime, &curr_cputime)
                    {
                        self.skel.maps.bss_data.as_mut().unwrap().cpu_util = cpu_util;
                    }
                    prev_cputime = curr_cputime;
                }
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {} scheduler", SCHEDULER_NAME);

        // Restore default CPU idle QoS resume latency.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}