scx_flash/
main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::mem::MaybeUninit;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::warn;
use log::{debug, info};
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_flash";

#[derive(PartialEq)]
enum Powermode {
    Turbo,
    Performance,
    Powersave,
    Any,
}

fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            // Turbo mode: prioritize CPUs with the highest max frequency
            (Powermode::Turbo, CoreType::Big { turbo: true }) |
            // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
            (Powermode::Performance, CoreType::Big { .. }) |
            // Powersave mode: add all the Little CPUs
            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}
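
// Illustrative example (assumed topology, not from the original sources): on a
// hybrid CPU with two turbo-capable big cores (0-1), two non-turbo big cores
// (2-3) and four little cores (4-7), get_primary_cpus() would return [0, 1]
// for Turbo, [0, 1, 2, 3] for Performance and [4, 5, 6, 7] for Powersave.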

// Convert an array of CPUs to the corresponding cpumask of any arbitrary size.
fn cpus_to_cpumask(cpus: &Vec<usize>) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Determine the maximum CPU ID to create a sufficiently large byte vector.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // Create a byte vector with enough bytes to cover all CPU IDs.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the appropriate bits for each CPU ID.
    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Convert the byte vector to a hexadecimal string.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}
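
// Worked example (illustrative): cpus_to_cpumask(&vec![0, 1, 8]) sets bits 0-1
// of byte 0 and bit 0 of byte 1, so the bytes [0x03, 0x01] are emitted
// most-significant byte first, yielding the string "0x0103".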

#[derive(Debug, clap::Parser)]
#[command(
    name = "scx_flash",
    version,
    disable_version_flag = true,
    about = "A deadline-based scheduler focused on fairness and performance predictability.",
    long_about = r#"
scx_flash is a scheduler that focuses on ensuring fairness and performance predictability.

It operates using an earliest deadline first (EDF) policy. The deadline of each task is
defined as:

    deadline = vruntime + exec_vruntime

`vruntime` represents the task's accumulated runtime, inversely scaled by its weight, while
`exec_vruntime` accounts for the vruntime accumulated since the last sleep event.

Fairness is ensured through `vruntime`, whereas `exec_vruntime` helps prioritize latency-sensitive
tasks. Tasks that are frequently blocked waiting for an event (typically latency-sensitive)
accumulate a smaller `exec_vruntime` compared to tasks that continuously consume CPU without
interruption.

As a result, tasks with a smaller `exec_vruntime` will have a shorter deadline and will be
dispatched earlier, ensuring better responsiveness for latency-sensitive tasks.

Moreover, tasks can accumulate a maximum `vruntime` credit while they're sleeping, based on how
often they voluntarily release the CPU (`avg_nvcsw`). This allows prioritizing frequent sleepers
over less-frequent ones.
"#
)]
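// A worked instance of the deadline formula above (illustrative numbers, not
// from the original sources): take two tasks of equal weight, both with
// vruntime = 100us. An I/O-bound task that just woke up may have
// exec_vruntime = 1us, giving deadline = 101us, while a CPU hog running since
// its last sleep may have exec_vruntime = 50us, giving deadline = 150us.
// Under EDF the I/O-bound task is dispatched first.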
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    /// Maximum runtime budget that a task can accumulate while sleeping (in microseconds).
    ///
    /// Increasing this value can help to enhance the responsiveness of interactive tasks, but it
    /// can also make performance more "spiky".
    #[clap(short = 'l', long, default_value = "20000")]
    slice_us_lag: u64,

    /// Maximum runtime penalty that a task can accumulate while running (in microseconds).
    ///
    /// Increasing this value can help to enhance the responsiveness of interactive tasks, but it
    /// can also make performance more "spiky".
    #[clap(short = 'r', long, default_value = "20000")]
    run_us_lag: u64,

    /// Maximum rate of voluntary context switches.
    ///
    /// Increasing this value can help prioritize interactive tasks with a higher sleep frequency
    /// over interactive tasks with a lower sleep frequency.
    ///
    /// Decreasing this value makes the scheduler more robust and fair.
    ///
    /// (0 = disable voluntary context switch prioritization).
    #[clap(short = 'c', long, default_value = "128")]
    max_avg_nvcsw: u64,

    /// Throttle the running CPUs by periodically injecting idle cycles.
    ///
    /// This option can help extend battery life on portable devices, reduce heating, fan noise
    /// and overall energy consumption (0 = disable).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
    ///
    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
    /// value may reduce performance, but also improve power efficiency.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    /// Enable per-CPU task prioritization.
    ///
    /// This allows prioritizing per-CPU tasks that usually tend to be de-prioritized (since they
    /// can't be migrated when their only usable CPU is busy). Enabling this option can introduce
    /// unfairness and potentially trigger stalls, but it can improve performance of server-type
    /// workloads (such as large parallel builds).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Use native task priorities.
    ///
    /// By default, the scheduler normalizes task priorities to avoid large gaps that could lead to
    /// stalls or starvation. This option disables normalization and uses the default Linux priority
    /// range instead.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    native_priority: bool,

    /// Enable kthread prioritization (EXPERIMENTAL).
    ///
    /// Enabling this can improve system performance, but it may also introduce noticeable
    /// interactivity issues or unfairness in scenarios with high kthread activity, such as heavy
    /// I/O or network traffic.
    ///
    /// Use it only when conducting specific experiments or if you have a clear understanding of
    /// its implications.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    ///
    /// Enabling this option can lead to a more uniform load distribution across available cores,
    /// potentially improving performance in certain scenarios. However, it may come at the cost of
    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
    /// coupling.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
    /// tasks may overflow to other available CPUs.
    ///
    /// Special values:
    ///  - "auto" = automatically detect the CPUs based on the active power profile
    ///  - "turbo" = automatically detect and prioritize the CPUs with the highest max frequency
    ///  - "performance" = automatically detect and prioritize the fastest CPUs
    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
    ///  - "all" = all CPUs assigned to the primary domain
    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA rebalancing.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control (only with the schedutil governor).
    ///
    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The scheduler
    /// is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        set_rlimit_infinity();

        // Validate command line arguments.
        assert!(opts.slice_us >= opts.slice_us_min);

        // Initialize CPU topology.
        let topo = Topology::new().unwrap();

        // Check host topology to determine if we need to enable SMT capabilities.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Print command line.
        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Initialize BPF connector.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let mut skel = scx_ops_open!(skel_builder, open_object, flash_ops)?;

        skel.struct_ops.flash_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override default BPF scheduling parameters.
        skel.maps.rodata_data.debug = opts.debug;
        skel.maps.rodata_data.smt_enabled = smt_enabled;
        skel.maps.rodata_data.numa_disabled = opts.disable_numa;
        skel.maps.rodata_data.local_pcpu = opts.local_pcpu;
        skel.maps.rodata_data.no_wake_sync = opts.no_wake_sync;
        skel.maps.rodata_data.native_priority = opts.native_priority;
        skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
        skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
        skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
        skel.maps.rodata_data.run_lag = opts.run_us_lag * 1000;
        skel.maps.rodata_data.throttle_ns = opts.throttle_us * 1000;
        skel.maps.rodata_data.max_avg_nvcsw = opts.max_avg_nvcsw;

        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
        // (it's never a good idea to throttle per-CPU kthreads).
        skel.maps.rodata_data.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        // Set scheduler compatibility flags.
        skel.maps.rodata_data.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Set scheduler flags.
        skel.struct_ops.flash_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP;
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.flash_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, flash_ops, uei)?;

        // Initialize the primary scheduling domain and the preferred domain.
        let power_profile = Self::power_profile();
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Initialize L2 cache domains.
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        // Initialize L3 cache domains.
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler.
        let struct_ops = Some(scx_ops_attach!(skel, flash_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
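        // Pass the argument struct to the BPF program as a raw byte slice and
        // invoke it via test_run().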
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false } => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Performance => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            &_ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        // Clear the primary domain by passing a negative CPU id.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        // Update the primary scheduling domain.
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

    // Update the hint for the cpufreq governor.
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &String,
        auto: bool,
    ) -> Result<()> {
        // If the powersave profile is in use, always scale the CPU frequency to the minimum;
        // otherwise use the maximum, unless automatic frequency scaling is enabled.
        let perf_lvl: i64 = match primary_domain.as_str() {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
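        // As in enable_primary_cpu(), pass the argument struct to the BPF
        // program as raw bytes and invoke it via test_run().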
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        // A single CPU or an empty array is trivially considered siblings.
        if cpus.len() <= 1 {
            return true;
        }

        // Check if each CPU is a sibling of the first CPU.
        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }
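
    // Illustrative example (assumed sibling map, not from the original
    // sources): with smt_siblings = [1, 0, 3, 2], are_smt_siblings() returns
    // true for [0, 1] (the CPUs map to each other) and false for [0, 2]
    // (different physical cores).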

    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Determine the list of CPU IDs associated with each cache node.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        // Update the BPF cpumasks for the cache domains.
        for (cache_id, cpus) in cache_id_map {
            // Ignore the cache domain if it includes a single CPU.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore the cache domain if all the CPUs are part of the same SMT core.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }
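
    // Illustrative example (assumed topology, not from the original sources):
    // on a system where CPUs 0-3 share one L3 cache and CPUs 4-7 another,
    // cache_id_map for cache_lvl = 3 would be {0: [0, 1, 2, 3], 1: [4, 5, 6, 7]},
    // and every (cpu, sibling_cpu) pair within each group is registered with
    // the BPF side through enable_sibling_cpu_fn.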

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn get_metrics(&self) -> Metrics {
        Metrics {
            nr_running: self.skel.maps.bss_data.nr_running,
            nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
            nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
            nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {} scheduler", SCHEDULER_NAME);

        // Restore the default CPU idle QoS resume latency.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error: {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}
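
// A minimal sanity-check sketch for cpus_to_cpumask() (added for illustration,
// not part of the original sources): it verifies the little-endian bit layout
// and the hex formatting documented above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cpus_to_cpumask() {
        // No CPUs: the special "none" marker is returned.
        assert_eq!(cpus_to_cpumask(&vec![]), "none");
        // CPU 0 only: bit 0 of the first byte.
        assert_eq!(cpus_to_cpumask(&vec![0]), "0x01");
        // CPUs 0, 1 and 8: 0b00000011 in byte 0, 0b00000001 in byte 1.
        assert_eq!(cpus_to_cpumask(&vec![0, 1, 8]), "0x0103");
    }
}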