scx_flash/
main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::mem::MaybeUninit;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::{debug, info, warn};
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::try_set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_flash";

#[derive(PartialEq)]
enum Powermode {
    Turbo,
    Performance,
    Powersave,
    Any,
}

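// Return the list of CPU IDs that match the requested power mode, based on the
// host topology (e.g., all the Big CPUs in Performance mode).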
fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            // Turbo mode: prioritize CPUs with the highest max frequency
            (Powermode::Turbo, CoreType::Big { turbo: true }) |
            // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
            (Powermode::Performance, CoreType::Big { .. }) |
            // Powersave mode: add all the Little CPUs
            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}

// Convert an array of CPUs to the corresponding cpumask of arbitrary size.
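//
// For example, CPUs [0, 1, 8] set bits 0, 1 and 8 in the bitmask, which is
// rendered as the hex string "0x0103" (byte 0x01 covers CPU 8, byte 0x03
// covers CPUs 0 and 1).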
fn cpus_to_cpumask(cpus: &Vec<usize>) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Determine the maximum CPU ID to create a sufficiently large byte vector.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // Create a byte vector with enough bytes to cover all CPU IDs.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the appropriate bits for each CPU ID.
    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Convert the byte vector to a hexadecimal string.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}

#[derive(Debug, clap::Parser)]
#[command(
    name = "scx_flash",
    version,
    disable_version_flag = true,
    about = "A deadline-based scheduler focused on fairness and performance predictability.",
    long_about = r#"
scx_flash is a scheduler that focuses on ensuring fairness and performance predictability.

It operates using an earliest deadline first (EDF) policy. The deadline of each task is
defined as:

    deadline = vruntime + exec_vruntime

Here, `vruntime` represents the task's total accumulated runtime, inversely scaled by its weight,
while `exec_vruntime` accounts for the scaled runtime accumulated since the last sleep event.

Fairness is driven by `vruntime`, while `exec_vruntime` helps prioritize latency-sensitive tasks
that sleep frequently and use the CPU in short bursts.
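
For example, a task that has accumulated 10 ms of weighted runtime (`vruntime`) and 2 ms of
runtime since its last wakeup (`exec_vruntime`) is assigned a deadline of 12 ms; among the
runnable tasks, the one with the earliest deadline runs first.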

To prevent sleeping tasks from gaining excessive priority, the maximum vruntime credit a task can
accumulate while sleeping is capped by `slice_lag`, scaled by the task’s voluntary context switch
rate (`max_avg_nvcsw`): tasks that sleep frequently can receive a larger credit, while tasks that
perform fewer, longer sleeps are granted a smaller credit. This encourages responsive behavior
without excessively boosting idle tasks.

When dynamic fairness is enabled (`--slice-lag-scaling`), the maximum vruntime sleep credit is also
scaled depending on the user-mode CPU utilization:

 - At low utilization (mostly idle system), the impact of `vruntime` is reduced, and scheduling
   decisions are driven primarily by `exec_vruntime`. This favors bursty, latency-sensitive
   workloads (e.g., hackbench), improving their performance and latency.

 - At high utilization, sleeping tasks regain their vruntime credit, increasing the influence of
   `vruntime` in deadline calculation. This restores fairness and ensures system responsiveness
   under load.

This adaptive behavior allows the scheduler to prioritize intense message-passing workloads when
the system is lightly loaded, while maintaining fairness and responsiveness when the system is
saturated or overcommitted.
"#
)]
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "4096")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "128")]
    slice_us_min: u64,

    /// Maximum runtime budget that a task can accumulate while sleeping (in microseconds).
    ///
    /// Increasing this value can help enhance the responsiveness of interactive tasks, but it
    /// can also make performance more "spikey".
    #[clap(short = 'l', long, default_value = "4096")]
    slice_us_lag: u64,

    /// Dynamically adjust a task's maximum sleep budget based on CPU utilization.
    ///
    /// Enabling this option can increase the throughput of intense message-passing workloads,
    /// but it can also reduce the overall system responsiveness.
    #[clap(short = 'L', long, action = clap::ArgAction::SetTrue)]
    slice_lag_scaling: bool,

    /// Maximum runtime penalty that a task can accumulate while running (in microseconds).
    ///
    /// Increasing this value can help enhance the responsiveness of interactive tasks, but it
    /// can also make performance more "spikey".
    #[clap(short = 'r', long, default_value = "32768")]
    run_us_lag: u64,

    /// Maximum rate of voluntary context switches.
    ///
    /// Increasing this value can help prioritize interactive tasks with a higher sleep frequency
    /// over interactive tasks with a lower sleep frequency.
    ///
    /// Decreasing this value makes the scheduler more robust and fair.
    ///
    /// (0 = disable voluntary context switch prioritization).
    #[clap(short = 'c', long, default_value = "128")]
    max_avg_nvcsw: u64,

    /// Utilization percentage to consider a CPU as busy (-1 = auto).
    ///
    /// A value close to 0 forces tasks to migrate more quickly, increasing work conservation and
    /// potentially system responsiveness.
    ///
    /// A value close to 100 makes tasks more sticky to their CPU, benefiting cache-sensitive and
    /// server-type workloads.
    ///
    /// In auto mode (-1) the scheduler automatically tries to determine the optimal value as a
    /// function of the current workload.
    #[clap(short = 'C', long, allow_hyphen_values = true, default_value = "-1")]
    cpu_busy_thresh: i64,

    /// Throttle the running CPUs by periodically injecting idle cycles.
    ///
    /// This option can help extend battery life on portable devices, reduce heating, fan noise
    /// and overall energy consumption (0 = disable).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
    ///
    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
    /// value may reduce performance, but also improve power efficiency.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "32")]
    idle_resume_us: i64,

    /// Enable tickless mode.
    ///
    /// This option enables tickless mode: tasks get an infinite time slice and they are preempted
    /// only in case of CPU contention. This can help reduce the OS noise and provide a better
    /// level of performance isolation.
    #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
    tickless: bool,

    /// Enable round-robin scheduling.
    ///
    /// Each task is given a fixed time slice (defined by --slice-us) and runs in a cyclic, fair
    /// order.
    #[clap(short = 'R', long, action = clap::ArgAction::SetTrue)]
    rr_sched: bool,

    /// Disable in-kernel idle CPU selection policy.
    ///
    /// Set this option to disable the in-kernel built-in idle CPU selection policy and rely on the
    /// custom CPU selection policy.
    #[clap(short = 'b', long, action = clap::ArgAction::SetTrue)]
    no_builtin_idle: bool,

    /// Enable per-CPU tasks prioritization.
    ///
    /// Enabling this option allows prioritizing per-CPU tasks that usually tend to be
    /// de-prioritized, since they can't be migrated when their only usable CPU is busy. This
    /// improves fairness, but it can also reduce the overall system throughput.
    ///
    /// This option is recommended for gaming or latency-sensitive workloads.
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Always allow direct dispatch to idle CPUs.
    ///
    /// By default tasks are not directly dispatched to idle CPUs if there are other tasks waiting
    /// in the scheduler's queues. This prevents potential starvation of the already queued tasks.
    ///
    /// Enabling this option allows tasks to always be dispatched directly to idle CPUs,
    /// potentially bypassing the scheduler queues.
    ///
    /// This can improve system throughput, especially with server workloads, but it can
    /// introduce unfairness and potentially trigger stall conditions.
    #[clap(short = 'D', long, action = clap::ArgAction::SetTrue)]
    direct_dispatch: bool,

    /// Enable CPU stickiness.
    ///
    /// Enabling this option can reduce the amount of task migrations, but it can also make
    /// performance less consistent on systems with hybrid cores.
    ///
    /// This option has no effect if the primary scheduling domain includes all the CPUs
    /// (e.g., `--primary-domain all`).
    #[clap(short = 'y', long, action = clap::ArgAction::SetTrue)]
    sticky_cpu: bool,

    /// Native task priorities.
    ///
    /// By default, the scheduler normalizes task priorities to avoid large gaps that could lead to
    /// stalls or starvation. This option disables normalization and uses the default Linux priority
    /// range instead.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    native_priority: bool,

    /// Enable per-CPU kthread prioritization.
    ///
    /// Enabling this can improve system performance, but it may also introduce interactivity
    /// issues or unfairness in scenarios with high kthread activity, such as heavy I/O or network
    /// traffic.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    ///
    /// Enabling this option can lead to a more uniform load distribution across available cores,
    /// potentially improving performance in certain scenarios. However, it may come at the cost of
    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
    /// coupling.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
    /// tasks may overflow to other available CPUs.
    ///
    /// Special values:
    ///  - "auto" = automatically detect the CPUs based on the active power profile
    ///  - "turbo" = automatically detect and prioritize the CPUs with the highest max frequency
    ///  - "performance" = automatically detect and prioritize the fastest CPUs
    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
    ///  - "all" = all CPUs assigned to the primary domain
    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA rebalancing.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control (only with schedutil governor).
    ///
    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler
    /// is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,
}

#[derive(Debug, Clone, Copy)]
struct CpuTimes {
    user: u64,
    nice: u64,
    total: u64,
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        try_set_rlimit_infinity();

        // Validate command line arguments.
        assert!(opts.slice_us >= opts.slice_us_min);

        // Initialize CPU topology.
        let topo = Topology::new().unwrap();

        // Check host topology to determine if we need to enable SMT capabilities.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Print command line.
        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Determine the number of non-empty NUMA nodes in the system.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // Automatically disable NUMA optimizations when running on non-NUMA systems.
        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        // Determine the primary scheduling domain.
        let power_profile = Self::power_profile();
        let domain =
            Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
                anyhow!(
                    "failed to resolve primary domain '{}': {}",
                    &opts.primary_domain,
                    err
                )
            })?;

        // Initialize BPF connector.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, flash_ops, open_opts)?;

        skel.struct_ops.flash_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override default BPF scheduling parameters.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.debug = opts.debug;
        rodata.smt_enabled = smt_enabled;
        rodata.numa_disabled = numa_disabled;
        rodata.rr_sched = opts.rr_sched;
        rodata.local_pcpu = opts.local_pcpu;
        rodata.direct_dispatch = opts.direct_dispatch;
        rodata.sticky_cpu = opts.sticky_cpu;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.tickless_sched = opts.tickless;
        rodata.native_priority = opts.native_priority;
        rodata.slice_lag_scaling = opts.slice_lag_scaling;
        rodata.builtin_idle = !opts.no_builtin_idle;
        rodata.slice_max = opts.slice_us * 1000;
        rodata.slice_min = opts.slice_us_min * 1000;
        rodata.slice_lag = opts.slice_us_lag * 1000;
        rodata.run_lag = opts.run_us_lag * 1000;
        rodata.throttle_ns = opts.throttle_us * 1000;
        rodata.max_avg_nvcsw = opts.max_avg_nvcsw;
        rodata.primary_all = domain.weight() == *NR_CPU_IDS;

        // Normalize the CPU busy threshold to the range [0 .. 1024].
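        // For example, a user-specified threshold of 50 (%) maps to
        // 50 * 1024 / 100 = 512.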
        rodata.cpu_busy_thresh = if opts.cpu_busy_thresh < 0 {
            opts.cpu_busy_thresh
        } else {
            opts.cpu_busy_thresh * 1024 / 100
        };

        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
        // (it's never a good idea to throttle per-CPU kthreads).
        rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        // Set scheduler compatibility flags.
        rodata.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Set scheduler flags.
        skel.struct_ops.flash_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.flash_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, flash_ops, uei)?;

        // Initialize the primary scheduling domain and the preferred domain.
        Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
            anyhow!(
                "failed to initialize primary domain 0x{:x}: {}",
                domain,
                err
            )
        })?;

        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            bail!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Initialize L2 cache domains.
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        // Initialize L3 cache domains.
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler.
        let struct_ops = Some(scx_ops_attach!(skel, flash_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
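        // Pass the argument struct to the BPF program as a raw byte buffer.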
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

    fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false }
                | PowerProfile::Performance
                | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            &_ => Cpumask::from_str(primary_domain)?,
        };

        Ok(domain)
    }

    fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
        info!("primary CPU domain = 0x{:x}", domain);

        // Clear the primary domain by passing a negative CPU id.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            bail!("failed to reset primary domain: error {}", err);
        }

        // Update primary scheduling domain.
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

    // Update hint for the cpufreq governor.
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &String,
        auto: bool,
    ) -> Result<()> {
        // Unless automatic frequency scaling is requested, scale the CPU frequency to
        // the minimum when using the powersave profile, and to the maximum otherwise.
        let perf_lvl: i64 = match primary_domain.as_str() {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

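    // Return true if all the CPUs in the slice are SMT siblings belonging to
    // the same physical core.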
    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        // A single CPU or an empty array is considered siblings.
        if cpus.len() <= 1 {
            return true;
        }

        // Check if each CPU is a sibling of the first CPU.
        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }

    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Determine the list of CPU IDs associated with each cache node.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        // Update the BPF cpumasks for the cache domains.
        for (cache_id, cpus) in cache_id_map {
            // Ignore the cache domain if it includes a single CPU.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore the cache domain if all the CPUs are part of the same SMT core.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn get_metrics(&self) -> Metrics {
        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
        Metrics {
            nr_running: bss_data.nr_running,
            nr_cpus: bss_data.nr_online_cpus,
            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: bss_data.nr_direct_dispatches,
            nr_shared_dispatches: bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

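    // Return the user-mode CPU utilization between two /proc/stat samples,
    // scaled to the range [0 .. 1024]. For example, if user + nice time
    // advanced by 512 ticks out of a 1024-tick total, the result is 512
    // (i.e., 50% utilization).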
    fn compute_user_cpu_pct(prev: &CpuTimes, curr: &CpuTimes) -> Option<u64> {
        let total_diff = curr.total.saturating_sub(prev.total);
        let user_diff = (curr.user + curr.nice).saturating_sub(prev.user + prev.nice);

        if total_diff > 0 {
            let user_ratio = user_diff as f64 / total_diff as f64;
            Some((user_ratio * 1024.0).round() as u64)
        } else {
            None
        }
    }

    fn read_cpu_times() -> Option<CpuTimes> {
        let file = File::open("/proc/stat").ok()?;
        let reader = BufReader::new(file);

        for line in reader.lines() {
            let line = line.ok()?;
            if line.starts_with("cpu ") {
                let fields: Vec<&str> = line.split_whitespace().collect();
                if fields.len() < 5 {
                    return None;
                }

                let user: u64 = fields[1].parse().ok()?;
                let nice: u64 = fields[2].parse().ok()?;

                // Sum the first 8 fields as total time, including idle, system, etc.
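                // (user, nice, system, idle, iowait, irq, softirq, steal).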
                let total: u64 = fields
                    .iter()
                    .skip(1)
                    .take(8)
                    .filter_map(|v| v.parse::<u64>().ok())
                    .sum();

                return Some(CpuTimes { user, nice, total });
            }
        }

        None
    }

    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let mut prev_cputime = Self::read_cpu_times().expect("Failed to read initial CPU stats");
        let (res_ch, req_ch) = self.stats_server.channels();

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }

            if self.opts.cpu_busy_thresh < 0 {
                if let Some(curr_cputime) = Self::read_cpu_times() {
                    if let Some(cpu_util) = Self::compute_user_cpu_pct(&prev_cputime, &curr_cputime)
                    {
                        self.skel.maps.bss_data.as_mut().unwrap().cpu_util = cpu_util;
                    }
                    prev_cputime = curr_cputime;
                }
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");

        // Restore default CPU idle QoS resume latency.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}
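
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sanity check, added here as an illustrative sketch (not part of
    // the upstream sources): CPUs 0, 1 and 8 set bits 0, 1 and 8, which render
    // as the hex mask "0x0103".
    #[test]
    fn test_cpus_to_cpumask() {
        let empty: Vec<usize> = Vec::new();
        assert_eq!(cpus_to_cpumask(&empty), "none");
        assert_eq!(cpus_to_cpumask(&vec![0, 1, 8]), "0x0103");
    }

    // Sketch of the utilization math in compute_user_cpu_pct(): 512 user+nice
    // ticks out of 1024 total ticks yield 512, i.e. 50% of the [0 .. 1024] scale.
    #[test]
    fn test_compute_user_cpu_pct() {
        let prev = CpuTimes { user: 0, nice: 0, total: 0 };
        let curr = CpuTimes { user: 400, nice: 112, total: 1024 };
        assert_eq!(Scheduler::compute_user_cpu_pct(&prev, &curr), Some(512));

        // No elapsed time means the utilization cannot be computed.
        assert_eq!(Scheduler::compute_user_cpu_pct(&prev, &prev), None);
    }
}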