scx_bpfland/
main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Andrea Righi <andrea.righi@linux.dev>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::fmt::Write;
use std::mem::MaybeUninit;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::warn;
use log::{debug, info};
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::try_set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::Metrics;

const SCHEDULER_NAME: &str = "scx_bpfland";

#[derive(PartialEq)]
enum Powermode {
    Performance,
    Powersave,
    Any,
}

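// Return the list of CPU IDs that match the requested power mode, based on
// the core type (Big/Little) of each CPU reported by the host topology.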
fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
    let topo = Topology::new().unwrap();

    let cpus: Vec<usize> = topo
        .all_cores
        .values()
        .flat_map(|core| &core.cpus)
        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
            // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
            (Powermode::Performance, CoreType::Big { .. }) |
            // Powersave mode: add all the Little CPUs
            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
            (Powermode::Any, ..) => Some(*cpu_id),
            _ => None,
        })
        .collect();

    Ok(cpus)
}

// Convert a list of CPU IDs into the corresponding cpumask, formatted as a
// hexadecimal string of arbitrary size.
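// e.g., cpus_to_cpumask(&[0, 1, 8]) returns "0x0103".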
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Determine the maximum CPU ID to create a sufficiently large byte vector.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // Create a byte vector with enough bytes to cover all CPU IDs.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the appropriate bits for each CPU ID.
    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Convert the byte vector to a hexadecimal string.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}

/// scx_bpfland: a vruntime-based sched_ext scheduler that prioritizes interactive workloads.
///
/// This scheduler is derived from scx_rustland, but it is fully implemented in BPF. It has a minimal
/// user-space part written in Rust to process command line options, collect metrics and log out
/// scheduling statistics.
///
/// The BPF part makes all the scheduling decisions (see src/bpf/main.bpf.c).
#[derive(Debug, Parser)]
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    /// Maximum time slice lag in microseconds.
    ///
    /// A positive value can help enhance the responsiveness of interactive tasks, but it can
    /// also make performance more "spiky".
    ///
    /// A negative value can make performance more consistent, but it can also reduce the
    /// responsiveness of interactive tasks (by smoothing the effect of the vruntime scheduling and
    /// making the task ordering closer to a FIFO).
    #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
    slice_us_lag: i64,

    /// Throttle the running CPUs by periodically injecting idle cycles.
    ///
    /// This option can help extend battery life on portable devices, reduce heating, fan noise
    /// and overall energy consumption (0 = disable).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
    ///
    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
    /// value may reduce performance, but also improve power efficiency.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    /// Disable preemption.
    ///
    /// Never allow tasks to be directly dispatched. This can help increase fairness
    /// at the expense of responsiveness.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    no_preempt: bool,

    /// Enable per-CPU task prioritization.
    ///
    /// This allows prioritizing per-CPU tasks that usually tend to be de-prioritized (since they
    /// can't be migrated when their only usable CPU is busy). Enabling this option can introduce
    /// unfairness and potentially trigger stalls, but it can improve performance of server-type
    /// workloads (such as large parallel builds).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Enable kthreads prioritization (EXPERIMENTAL).
    ///
    /// Enabling this can improve system performance, but it may also introduce noticeable
    /// interactivity issues or unfairness in scenarios with high kthread activity, such as heavy
    /// I/O or network traffic.
    ///
    /// Use it only when conducting specific experiments or if you have a clear understanding of
    /// its implications.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    ///
    /// Enabling this option can lead to a more uniform load distribution across available cores,
    /// potentially improving performance in certain scenarios. However, it may come at the cost of
    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
    /// coupling.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
    /// tasks may overflow to other available CPUs.
    ///
    /// Special values:
    ///  - "auto" = automatically detect the CPUs based on the active power profile
    ///  - "performance" = automatically detect and prioritize the fastest CPUs
    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
    ///  - "all" = all CPUs assigned to the primary domain
    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA rebalancing.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control (only with schedutil governor).
    ///
    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// [DEPRECATED] Maximum threshold of voluntary context switches per second. This is used to
    /// classify interactive tasks (0 = disable interactive task classification).
    #[clap(short = 'c', long, default_value = "10", hide = true)]
    nvcsw_max_thresh: u64,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The scheduler
    /// itself is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,
}

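// Runtime state of the scheduler: the loaded BPF skeleton, the attached
// struct_ops link, the host topology, the detected power profile and the
// stats server handle.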
struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    opts: &'a Opts,
    topo: Topology,
    power_profile: PowerProfile,
    stats_server: StatsServer<(), Metrics>,
    user_restart: bool,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        try_set_rlimit_infinity();

        // Validate command line arguments.
        assert!(opts.slice_us >= opts.slice_us_min);

        // Initialize CPU topology.
        let topo = Topology::new().unwrap();

        // Check host topology to determine if we need to enable SMT capabilities.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        // Determine the number of non-empty NUMA nodes in the system.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // Automatically disable NUMA optimizations when running on non-NUMA systems.
        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Initialize BPF connector.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops, open_opts)?;

        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override default BPF scheduling parameters.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.debug = opts.debug;
        rodata.smt_enabled = smt_enabled;
        rodata.numa_disabled = numa_disabled;
        rodata.local_pcpu = opts.local_pcpu;
        rodata.no_preempt = opts.no_preempt;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.slice_max = opts.slice_us * 1000;
        rodata.slice_min = opts.slice_us_min * 1000;
        rodata.slice_lag = opts.slice_us_lag * 1000;
        rodata.throttle_ns = opts.throttle_us * 1000;

        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
        // (it's never a good idea to throttle per-CPU kthreads).
        rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        // Set scheduler compatibility flags.
        rodata.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Set scheduler flags.
        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.bpfland_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;

        // Initialize the primary scheduling domain and the cpufreq performance level.
        let power_profile = Self::power_profile();
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Initialize L2 cache domains.
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        // Initialize L3 cache domains.
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler.
        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }

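    // Add a CPU to the primary scheduling domain by invoking the
    // enable_primary_cpu BPF program via BPF_PROG_RUN (a negative CPU id
    // clears the current domain).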
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

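    // Build the cpumask for the given power mode, falling back to all the
    // available CPUs if no CPU matches.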
    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
        if cpus.is_empty() {
            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
        }
        Cpumask::from_str(&cpus_to_cpumask(&cpus))
    }

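    // Resolve the primary scheduling domain from the --primary-domain option
    // ("auto" follows the active power profile) and propagate it to the BPF
    // side, one CPU at a time.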
    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false } => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Performance => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            &_ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        // Clear the primary domain by passing a negative CPU id.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        // Update primary scheduling domain.
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }

    // Update hint for the cpufreq governor.
    fn init_cpufreq_perf(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        auto: bool,
    ) -> Result<()> {
        // If we are using the powersave profile, always scale the CPU frequency to the minimum,
        // otherwise use the maximum, unless automatic frequency scaling is enabled.
        let perf_lvl: i64 = match primary_domain {
            "powersave" => 0,
            _ if auto => -1,
            _ => 1024,
        };
        info!(
            "cpufreq performance level: {}",
            match perf_lvl {
                1024 => "max".into(),
                0 => "min".into(),
                n if n < 0 => "auto".into(),
                _ => perf_lvl.to_string(),
            }
        );
        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;

        Ok(())
    }

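    // Detect the current power profile, falling back to a second probe if the
    // first one reports PowerProfile::Unknown.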
    fn power_profile() -> PowerProfile {
        let profile = fetch_power_profile(true);
        if profile == PowerProfile::Unknown {
            fetch_power_profile(false)
        } else {
            profile
        }
    }

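    // Detect power profile changes at runtime: return true when the scheduler
    // needs to be restarted to re-evaluate the primary domain (only in "auto"
    // mode), otherwise just refresh the cpufreq performance level.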
    fn refresh_sched_domain(&mut self) -> bool {
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }

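    // Register @sibling_cpu as a sibling of @cpu at the given topology level
    // by invoking the enable_sibling_cpu BPF program via BPF_PROG_RUN.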
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

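    // Register each CPU's SMT sibling with the BPF side (topology level 0).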
    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
        let smt_siblings = topo.sibling_cpus();

        info!("SMT sibling CPUs: {:?}", smt_siblings);
        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
        }

        Ok(())
    }

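    // Return true if all the CPUs in @cpus are SMT siblings of the same
    // physical core.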
    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
        // A single CPU or an empty slice is trivially considered siblings.
        if cpus.len() <= 1 {
            return true;
        }

        // Check if each CPU is a sibling of the first CPU.
        let first_cpu = cpus[0];
        let smt_siblings = topo.sibling_cpus();
        cpus.iter().all(|&cpu| {
            cpu == first_cpu
                || smt_siblings[cpu] == first_cpu as i32
                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
        })
    }

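    // Group CPUs by their L2/L3 cache node and register every pair of CPUs
    // sharing the same cache domain with the BPF side, skipping trivial
    // domains (a single CPU or a single SMT core).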
    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Determine the list of CPU IDs associated with each cache node.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        // Update the BPF cpumasks for the cache domains.
        for (cache_id, cpus) in cache_id_map {
            // Ignore the cache domain if it includes a single CPU.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore the cache domain if all the CPUs are part of the same SMT core.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }

    fn init_l2_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

    fn init_l3_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
    ) -> Result<(), std::io::Error> {
        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
        })
    }

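    // Snapshot the counters exported by the BPF side into a Metrics object
    // for the stats server.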
    fn get_metrics(&self) -> Metrics {
        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
        Metrics {
            nr_running: bss_data.nr_running,
            nr_cpus: bss_data.nr_online_cpus,
            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: bss_data.nr_direct_dispatches,
            nr_shared_dispatches: bss_data.nr_shared_dispatches,
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

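    // Main scheduler loop: serve stats requests until a shutdown is requested,
    // the BPF scheduler exits, or a power profile change requires a restart.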
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");

        // Restore default CPU idle QoS resume latency.
        if self.opts.idle_resume_us >= 0 {
            if cpu_idle_resume_latency_supported() {
                for cpu in self.topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
                        .unwrap();
                }
            }
        }
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

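    // Keep the scheduler running: re-initialize and restart it whenever the
    // kernel side requests a restart or a power profile change requires
    // re-evaluating the primary domain (user_restart).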
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}