// scx_bpfland/main.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2024 Andrea Righi <andrea.righi@linux.dev>
4
5// This software may be used and distributed according to the terms of the
6// GNU General Public License version 2.
7
8mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::collections::BTreeMap;
15use std::ffi::c_int;
16use std::fmt::Write;
17use std::mem::MaybeUninit;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::time::Duration;
22
23use anyhow::Context;
24use anyhow::Result;
25use clap::Parser;
26use crossbeam::channel::RecvTimeoutError;
27use libbpf_rs::OpenObject;
28use libbpf_rs::ProgramInput;
29use log::warn;
30use log::{debug, info};
31use scx_stats::prelude::*;
32use scx_utils::autopower::{fetch_power_profile, PowerProfile};
33use scx_utils::build_id;
34use scx_utils::compat;
35use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
36use scx_utils::scx_ops_attach;
37use scx_utils::scx_ops_load;
38use scx_utils::scx_ops_open;
39use scx_utils::set_rlimit_infinity;
40use scx_utils::uei_exited;
41use scx_utils::uei_report;
42use scx_utils::CoreType;
43use scx_utils::Cpumask;
44use scx_utils::Topology;
45use scx_utils::UserExitInfo;
46use scx_utils::NR_CPU_IDS;
47use stats::Metrics;
48
// Name used to register the sched_ext scheduler; also reported in logs and
// in the `--version` output.
const SCHEDULER_NAME: &str = "scx_bpfland";
50
/// CPU selection policy used when building the primary scheduling domain
/// (see `get_primary_cpus`).
#[derive(PartialEq)]
enum Powermode {
    /// Select the Big CPUs (either Turbo or non-Turbo).
    Performance,
    /// Select the Little CPUs.
    Powersave,
    /// Select every CPU, regardless of its core type.
    Any,
}
57
58fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
59    let topo = Topology::new().unwrap();
60
61    let cpus: Vec<usize> = topo
62        .all_cores
63        .values()
64        .flat_map(|core| &core.cpus)
65        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
66            // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
67            (Powermode::Performance, CoreType::Big { .. }) |
68            // Powersave mode: add all the Little CPUs
69            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
70            (Powermode::Any, ..) => Some(*cpu_id),
71            _ => None,
72        })
73        .collect();
74
75    Ok(cpus)
76}
77
78// Convert an array of CPUs to the corresponding cpumask of any arbitrary size.
/// Convert a list of CPU IDs to the corresponding cpumask, rendered as a hex
/// string of any arbitrary size (e.g. "0x11" for CPUs 0 and 4).
///
/// Returns "none" when the list is empty.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Determine the maximum CPU ID to create a sufficiently large byte vector
    // (8 CPUs per byte).
    let max_cpu_id = *cpus.iter().max().unwrap();
    let mut bitmask = vec![0u8; (max_cpu_id + 1).div_ceil(8)];

    // Set the appropriate bit for each CPU ID.
    for cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // Render the bytes most-significant first, so the string reads like a
    // single big-endian hex number.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}
105
/// scx_bpfland: a vruntime-based sched_ext scheduler that prioritizes interactive workloads.
///
/// This scheduler is derived from scx_rustland, but it is fully implemented in BPF. It has a minimal
/// user-space part written in Rust to process command line options, collect metrics and log out
/// scheduling statistics.
///
/// The BPF part makes all the scheduling decisions (see src/bpf/main.bpf.c).
//
// NOTE: the `///` doc comments on this struct and its fields are emitted by
// clap as the program's `--help` text — editing them changes runtime output.
#[derive(Debug, Parser)]
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(short = 'S', long, default_value = "1000")]
    slice_us_min: u64,

    /// Maximum time slice lag in microseconds.
    ///
    /// A positive value can help to enhance the responsiveness of interactive tasks, but it can
    /// also make performance more "spikey".
    ///
    /// A negative value can make performance more consistent, but it can also reduce the
    /// responsiveness of interactive tasks (by smoothing the effect of the vruntime scheduling and
    /// making the task ordering closer to a FIFO).
    #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
    slice_us_lag: i64,

    /// Throttle the running CPUs by periodically injecting idle cycles.
    ///
    /// This option can help extend battery life on portable devices, reduce heating, fan noise
    /// and overall energy consumption (0 = disable).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
    ///
    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
    /// value may reduce performance, but also improve power efficiency.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    /// Disable preemption.
    ///
    /// Never allow tasks to be directly dispatched. This can help to increase fairness
    /// over responsiveness.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    no_preempt: bool,

    /// Enable per-CPU tasks prioritization.
    ///
    /// This allows to prioritize per-CPU tasks that usually tend to be de-prioritized (since they
    /// can't be migrated when their only usable CPU is busy). Enabling this option can introduce
    /// unfairness and potentially trigger stalls, but it can improve performance of server-type
    /// workloads (such as large parallel builds).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Enable kthreads prioritization (EXPERIMENTAL).
    ///
    /// Enabling this can improve system performance, but it may also introduce noticeable
    /// interactivity issues or unfairness in scenarios with high kthread activity, such as heavy
    /// I/O or network traffic.
    ///
    /// Use it only when conducting specific experiments or if you have a clear understanding of
    /// its implications.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    ///
    /// Enabling this option can lead to a more uniform load distribution across available cores,
    /// potentially improving performance in certain scenarios. However, it may come at the cost of
    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
    /// coupling.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
    /// tasks may overflow to other available CPUs.
    ///
    /// Special values:
    ///  - "auto" = automatically detect the CPUs based on the active power profile
    ///  - "performance" = automatically detect and prioritize the fastest CPUs
    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
    ///  - "all" = all CPUs assigned to the primary domain
    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Disable L2 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l2: bool,

    /// Disable L3 cache awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_l3: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA rebalancing.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control (only with schedutil governor).
    ///
    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    /// [DEPRECATED] Maximum threshold of voluntary context switches per second. This is used to
    /// classify interactive.
    ///
    /// tasks (0 = disable interactive tasks classification).
    #[clap(short = 'c', long, default_value = "10", hide = true)]
    nvcsw_max_thresh: u64,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler
    /// is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}
256
/// User-space side of the scheduler: owns the loaded BPF skeleton, the
/// struct_ops attachment link and the stats server.
struct Scheduler<'a> {
    /// Loaded BPF skeleton (programs, maps, rodata/bss).
    skel: BpfSkel<'a>,
    /// Link holding the attached struct_ops; taken/dropped to detach.
    struct_ops: Option<libbpf_rs::Link>,
    /// Parsed command line options.
    opts: &'a Opts,
    /// Host CPU topology, used to restore idle QoS settings on drop.
    topo: Topology,
    /// Power profile detected at init; compared against fresh reads to
    /// detect runtime profile changes.
    power_profile: PowerProfile,
    /// Server answering metric requests from the stats monitor.
    stats_server: StatsServer<(), Metrics>,
    /// Set when the main loop should restart the scheduler (e.g. the power
    /// profile changed while the primary domain is "auto").
    user_restart: bool,
}
266
267impl<'a> Scheduler<'a> {
    /// Open, configure, load and attach the BPF scheduler, returning a
    /// ready-to-run `Scheduler` instance.
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        set_rlimit_infinity();

        // Validate command line arguments.
        assert!(opts.slice_us >= opts.slice_us_min);

        // Initialize CPU topology.
        let topo = Topology::new().unwrap();

        // Check host topology to determine if we need to enable SMT capabilities.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        // Determine the amount of non-empty NUMA nodes in the system.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // Automatically disable NUMA optimizations when running on non-NUMA systems.
        let numa_disabled = opts.disable_numa || nr_nodes == 1;
        if numa_disabled {
            info!("Disabling NUMA optimizations");
        }

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Apply the requested CPU idle QoS resume latency to every CPU
        // (restored to the per-CPU defaults when the scheduler is dropped).
        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Initialize BPF connector.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops)?;

        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override default BPF scheduling parameters.
        // Note: the *_us options are converted to nanoseconds for BPF.
        skel.maps.rodata_data.debug = opts.debug;
        skel.maps.rodata_data.smt_enabled = smt_enabled;
        skel.maps.rodata_data.numa_disabled = numa_disabled;
        skel.maps.rodata_data.local_pcpu = opts.local_pcpu;
        skel.maps.rodata_data.no_preempt = opts.no_preempt;
        skel.maps.rodata_data.no_wake_sync = opts.no_wake_sync;
        skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
        skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
        skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
        skel.maps.rodata_data.throttle_ns = opts.throttle_us * 1000;

        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
        // (it's never a good idea to throttle per-CPU kthreads).
        skel.maps.rodata_data.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        // Set scheduler compatibility flags.
        skel.maps.rodata_data.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;

        // Set scheduler flags.
        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_disabled {
                0
            } else {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.bpfland_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;

        // Initialize the primary scheduling domain and the preferred domain.
        // Both are best-effort: failures are logged but not fatal.
        let power_profile = Self::power_profile();
        if let Err(err) = Self::init_energy_domain(&mut skel, &opts.primary_domain, power_profile) {
            warn!("failed to initialize primary domain: error {}", err);
        }
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            warn!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Initialize L2 cache domains.
        if !opts.disable_l2 {
            Self::init_l2_cache_domains(&mut skel, &topo)?;
        }
        // Initialize L3 cache domains.
        if !opts.disable_l3 {
            Self::init_l3_cache_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler.
        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }
399
    /// Run the `enable_primary_cpu` BPF syscall program to add `cpu` to the
    /// primary scheduling domain; a negative `cpu` clears the domain (see
    /// `init_energy_domain`).
    ///
    /// On failure, returns the BPF program's non-zero return value.
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a plain-old-data struct that outlives the
            // test_run() call below, and the slice covers exactly its bytes.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }
421
422    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
423        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
424        if cpus.is_empty() {
425            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
426        }
427        Cpumask::from_str(&cpus_to_cpumask(&cpus))
428    }
429
    /// Resolve `primary_domain` into a cpumask and push it to the BPF side.
    ///
    /// Accepts the special values "powersave", "performance", "auto" and
    /// "all"; any other string is parsed directly as a cpumask.
    fn init_energy_domain(
        skel: &mut BpfSkel<'_>,
        primary_domain: &str,
        power_profile: PowerProfile,
    ) -> Result<()> {
        let domain = match primary_domain {
            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
            // "auto" maps the detected power profile to a CPU selection
            // policy; unknown or performance-oriented profiles use all CPUs.
            "auto" => match power_profile {
                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
                PowerProfile::Balanced { power: true } => {
                    Self::epp_to_cpumask(Powermode::Powersave)?
                }
                PowerProfile::Balanced { power: false } => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Performance => Self::epp_to_cpumask(Powermode::Any)?,
                PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
            },
            "all" => Self::epp_to_cpumask(Powermode::Any)?,
            &_ => Cpumask::from_str(primary_domain)?,
        };

        info!("primary CPU domain = 0x{:x}", domain);

        // Clear the primary domain by passing a negative CPU id.
        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
            warn!("failed to reset primary domain: error {}", err);
        }
        // Update primary scheduling domain.
        for cpu in 0..*NR_CPU_IDS {
            if domain.test_cpu(cpu) {
                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
                }
            }
        }

        Ok(())
    }
468
469    // Update hint for the cpufreq governor.
470    fn init_cpufreq_perf(
471        skel: &mut BpfSkel<'_>,
472        primary_domain: &String,
473        auto: bool,
474    ) -> Result<()> {
475        // If we are using the powersave profile always scale the CPU frequency to the minimum,
476        // otherwise use the maximum, unless automatic frequency scaling is enabled.
477        let perf_lvl: i64 = match primary_domain.as_str() {
478            "powersave" => 0,
479            _ if auto => -1,
480            _ => 1024,
481        };
482        info!(
483            "cpufreq performance level: {}",
484            match perf_lvl {
485                1024 => "max".into(),
486                0 => "min".into(),
487                n if n < 0 => "auto".into(),
488                _ => perf_lvl.to_string(),
489            }
490        );
491        skel.maps.bss_data.cpufreq_perf_lvl = perf_lvl;
492
493        Ok(())
494    }
495
496    fn power_profile() -> PowerProfile {
497        let profile = fetch_power_profile(true);
498        if profile == PowerProfile::Unknown {
499            fetch_power_profile(false)
500        } else {
501            profile
502        }
503    }
504
    /// Detect power profile changes at runtime.
    ///
    /// Returns true when the scheduler must be restarted to rebuild the
    /// primary domain (only when `--primary-domain auto` is in use).
    /// Otherwise, a profile change only refreshes the cpufreq performance
    /// level in place.
    fn refresh_sched_domain(&mut self) -> bool {
        // If the profile was Unknown at init time, never re-check it.
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }
526
    /// Run the `enable_sibling_cpu` BPF syscall program to register
    /// `sibling_cpu` as a sibling of `cpu` at domain level `lvl`
    /// (0 = SMT, 2 = L2 cache, 3 = L3 cache — see the init_* callers).
    ///
    /// On failure, returns the BPF program's non-zero return value.
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        lvl: usize,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            lvl_id: lvl as c_int,
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a plain-old-data struct that outlives the
            // test_run() call below, and the slice covers exactly its bytes.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }
555
556    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
557        let smt_siblings = topo.sibling_cpus();
558
559        info!("SMT sibling CPUs: {:?}", smt_siblings);
560        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
561            Self::enable_sibling_cpu(skel, 0, cpu, *sibling_cpu as usize).unwrap();
562        }
563
564        Ok(())
565    }
566
567    fn are_smt_siblings(topo: &Topology, cpus: &[usize]) -> bool {
568        // Single CPU or empty array are considered siblings.
569        if cpus.len() <= 1 {
570            return true;
571        }
572
573        // Check if each CPU is a sibling of the first CPU.
574        let first_cpu = cpus[0];
575        let smt_siblings = topo.sibling_cpus();
576        cpus.iter().all(|&cpu| {
577            cpu == first_cpu
578                || smt_siblings[cpu] == first_cpu as i32
579                || (smt_siblings[first_cpu] >= 0 && smt_siblings[first_cpu] == cpu as i32)
580        })
581    }
582
    /// Group CPUs by the cache node they share at `cache_lvl` (2 or 3) and
    /// register every pair of CPUs in the same group as siblings through
    /// `enable_sibling_cpu_fn`.
    ///
    /// Single-CPU domains and domains fully contained in one SMT core are
    /// skipped; failures to register a pair are logged and ignored.
    fn init_cache_domains(
        skel: &mut BpfSkel<'_>,
        topo: &Topology,
        cache_lvl: usize,
        enable_sibling_cpu_fn: &dyn Fn(&mut BpfSkel<'_>, usize, usize, usize) -> Result<(), u32>,
    ) -> Result<(), std::io::Error> {
        // Determine the list of CPU IDs associated to each cache node.
        let mut cache_id_map: BTreeMap<usize, Vec<usize>> = BTreeMap::new();
        for core in topo.all_cores.values() {
            for (cpu_id, cpu) in &core.cpus {
                let cache_id = match cache_lvl {
                    2 => cpu.l2_id,
                    3 => cpu.llc_id,
                    // Only levels 2 and 3 are ever passed by the callers.
                    _ => panic!("invalid cache level {}", cache_lvl),
                };
                cache_id_map.entry(cache_id).or_default().push(*cpu_id);
            }
        }

        // Update the BPF cpumasks for the cache domains.
        for (cache_id, cpus) in cache_id_map {
            // Ignore the cache domain if it includes a single CPU.
            if cpus.len() <= 1 {
                continue;
            }

            // Ignore the cache domain if all the CPUs are part of the same SMT core.
            if Self::are_smt_siblings(topo, &cpus) {
                continue;
            }

            info!(
                "L{} cache ID {}: sibling CPUs: {:?}",
                cache_lvl, cache_id, cpus
            );
            for cpu in &cpus {
                for sibling_cpu in &cpus {
                    if enable_sibling_cpu_fn(skel, cache_lvl, *cpu, *sibling_cpu).is_err() {
                        warn!(
                            "L{} cache ID {}: failed to set CPU {} sibling {}",
                            cache_lvl, cache_id, *cpu, *sibling_cpu
                        );
                    }
                }
            }
        }

        Ok(())
    }
632
633    fn init_l2_cache_domains(
634        skel: &mut BpfSkel<'_>,
635        topo: &Topology,
636    ) -> Result<(), std::io::Error> {
637        Self::init_cache_domains(skel, topo, 2, &|skel, lvl, cpu, sibling_cpu| {
638            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
639        })
640    }
641
642    fn init_l3_cache_domains(
643        skel: &mut BpfSkel<'_>,
644        topo: &Topology,
645    ) -> Result<(), std::io::Error> {
646        Self::init_cache_domains(skel, topo, 3, &|skel, lvl, cpu, sibling_cpu| {
647            Self::enable_sibling_cpu(skel, lvl, cpu, sibling_cpu)
648        })
649    }
650
    /// Snapshot the counters exported by the BPF side (bss map) into a
    /// `Metrics` record for the stats server.
    fn get_metrics(&self) -> Metrics {
        Metrics {
            nr_running: self.skel.maps.bss_data.nr_running,
            nr_cpus: self.skel.maps.bss_data.nr_online_cpus,
            nr_kthread_dispatches: self.skel.maps.bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
            nr_shared_dispatches: self.skel.maps.bss_data.nr_shared_dispatches,
        }
    }
660
    /// Return true when the BPF scheduler has reported an exit through the
    /// user exit info (uei) mechanism.
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
664
    /// Main loop: serve stats requests until a shutdown is requested, the BPF
    /// side exits, or a power profile change requires a restart (in which
    /// case `user_restart` is set for the caller).
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            // Poll stats requests with a 1s timeout so the loop keeps
            // re-checking the shutdown/exit conditions.
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        // Detach the scheduler by dropping the struct_ops link, then report
        // the BPF side's exit info.
        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
682}
683
684impl Drop for Scheduler<'_> {
685    fn drop(&mut self) {
686        info!("Unregister {} scheduler", SCHEDULER_NAME);
687
688        // Restore default CPU idle QoS resume latency.
689        if self.opts.idle_resume_us >= 0 {
690            if cpu_idle_resume_latency_supported() {
691                for cpu in self.topo.all_cpus.values() {
692                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
693                        .unwrap();
694                }
695            }
696        }
697    }
698}
699
/// Entry point: parse options, set up logging and signal handling, then run
/// the scheduler (restarting it when requested) or just the stats monitor.
fn main() -> Result<()> {
    let opts = Opts::parse();

    // Handle one-shot informational flags before doing any real work.
    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    // Configure terminal logging on stderr at Info level; timestamps are only
    // shown for Error-level messages, location/target/thread info is off.
    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    // Request a clean shutdown on Ctrl-C through a shared atomic flag.
    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    // Spawn the stats monitor thread when --stats or --monitor is given;
    // with --monitor, only monitor (the scheduler itself is not launched).
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    // Run the scheduler, restarting it when the exit info requests a restart
    // or when a power profile change set user_restart.
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}