// scx_bpfland/main.rs
1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2024 Andrea Righi <andrea.righi@linux.dev>
4
5// This software may be used and distributed according to the terms of the
6// GNU General Public License version 2.
7
8mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::{c_int, c_ulong};
15use std::fmt::Write;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use anyhow::anyhow;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::warn;
31use log::{debug, info};
32use scx_stats::prelude::*;
33use scx_utils::autopower::{fetch_power_profile, PowerProfile};
34use scx_utils::build_id;
35use scx_utils::compat;
36use scx_utils::libbpf_clap_opts::LibbpfOpts;
37use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
38use scx_utils::scx_ops_attach;
39use scx_utils::scx_ops_load;
40use scx_utils::scx_ops_open;
41use scx_utils::try_set_rlimit_infinity;
42use scx_utils::uei_exited;
43use scx_utils::uei_report;
44use scx_utils::CoreType;
45use scx_utils::Cpumask;
46use scx_utils::Topology;
47use scx_utils::UserExitInfo;
48use scx_utils::NR_CPU_IDS;
49use stats::Metrics;
50
/// Scheduler name used for logging, version reporting and the unregister message.
const SCHEDULER_NAME: &str = "scx_bpfland";
52
/// CPU selection policy used to build the primary scheduling domain
/// (see `get_primary_cpus()` and the `--primary-domain` option).
#[derive(PartialEq)]
enum Powermode {
    // Selected via "--primary-domain turbo": prefer turbo-capable CPUs.
    Turbo,
    // Big CPUs (both turbo and non-turbo).
    Performance,
    // Little (power-efficient) CPUs.
    Powersave,
    // Any CPU in the system (no filtering).
    Any,
}
60
61fn get_primary_cpus(mode: Powermode) -> std::io::Result<Vec<usize>> {
62    let topo = Topology::new().unwrap();
63
64    let cpus: Vec<usize> = topo
65        .all_cores
66        .values()
67        .flat_map(|core| &core.cpus)
68        .filter_map(|(cpu_id, cpu)| match (&mode, &cpu.core_type) {
69            // Performance mode: add all the Big CPUs (either Turbo or non-Turbo)
70            (Powermode::Performance, CoreType::Big { .. }) |
71            // Powersave mode: add all the Little CPUs
72            (Powermode::Powersave, CoreType::Little) => Some(*cpu_id),
73            (Powermode::Any, ..) => Some(*cpu_id),
74            _ => None,
75        })
76        .collect();
77
78    Ok(cpus)
79}
80
// Convert a slice of CPU IDs to the corresponding hex cpumask string of any
// arbitrary size (e.g. `[0, 1, 8]` -> `"0x0103"`), or `"none"` when empty.
//
// Accepting `&[usize]` (instead of `&Vec<usize>`) is more general; existing
// callers passing `&Vec<usize>` still work via deref coercion.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    if cpus.is_empty() {
        return String::from("none");
    }

    // Determine the maximum CPU ID to create a sufficiently large byte vector.
    let max_cpu_id = *cpus.iter().max().unwrap();

    // Create a byte vector with enough bytes to cover all CPU IDs.
    let mut bitmask = vec![0u8; (max_cpu_id + 1 + 7) / 8];

    // Set the appropriate bits for each CPU ID.
    for cpu_id in cpus {
        let byte_index = cpu_id / 8;
        let bit_index = cpu_id % 8;
        bitmask[byte_index] |= 1 << bit_index;
    }

    // Convert the byte vector to a hexadecimal string, most-significant byte first.
    let hex_str: String = bitmask.iter().rev().fold(String::new(), |mut f, byte| {
        let _ = write!(&mut f, "{:02x}", byte);
        f
    });

    format!("0x{}", hex_str)
}
108
/// scx_bpfland: a vruntime-based sched_ext scheduler that prioritizes interactive workloads.
///
/// This scheduler is derived from scx_rustland, but it is fully implemented in BPF. It has a minimal
/// user-space part written in Rust to process command line options, collect metrics and log out
/// scheduling statistics.
///
/// The BPF part makes all the scheduling decisions (see src/bpf/main.bpf.c).
// NOTE(review): the `///` doc comments on this struct and its fields double as the
// clap-generated `--help` text, so editing them changes user-visible output.
#[derive(Debug, Parser)]
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(short = 's', long, default_value = "1000")]
    slice_us: u64,

    /// Minimum scheduling slice duration in microseconds (0 = no minimum time slice).
    #[clap(short = 'L', long, default_value = "0")]
    slice_min_us: u64,

    /// Maximum time slice lag in microseconds.
    ///
    /// A positive value can help to enhance the responsiveness of interactive tasks, but it can
    /// also make performance more "spikey".
    #[clap(short = 'l', long, default_value = "40000")]
    slice_us_lag: u64,

    /// Throttle the running CPUs by periodically injecting idle cycles.
    ///
    /// This option can help extend battery life on portable devices, reduce heating, fan noise
    /// and overall energy consumption (0 = disable).
    #[clap(short = 't', long, default_value = "0")]
    throttle_us: u64,

    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
    ///
    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
    /// value may reduce performance, but also improve power efficiency.
    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
    idle_resume_us: i64,

    /// Enable per-CPU tasks prioritization.
    ///
    /// This allows to prioritize per-CPU tasks that usually tend to be de-prioritized (since they
    /// can't be migrated when their only usable CPU is busy). Enabling this option can introduce
    /// unfairness and potentially trigger stalls, but it can improve performance of server-type
    /// workloads (such as large parallel builds).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    local_pcpu: bool,

    /// Enable kthreads prioritization (EXPERIMENTAL).
    ///
    /// Enabling this can improve system performance, but it may also introduce noticeable
    /// interactivity issues or unfairness in scenarios with high kthread activity, such as heavy
    /// I/O or network traffic.
    ///
    /// Use it only when conducting specific experiments or if you have a clear understanding of
    /// its implications.
    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
    local_kthreads: bool,

    /// Disable direct dispatch during synchronous wakeups.
    ///
    /// Enabling this option can lead to a more uniform load distribution across available cores,
    /// potentially improving performance in certain scenarios. However, it may come at the cost of
    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
    /// coupling.
    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Enable sticky tasks.
    ///
    /// If enabled force tasks with a high rate of enqueues/sec to stay on the same CPU, to reduce
    /// locking contention on the shared runqueues.
    ///
    /// This can help making the scheduler more robust with intensive scheduling workloads and
    /// benchmarks, but it can negatively impact on latency.
    #[clap(short = 'S', long, action = clap::ArgAction::SetTrue)]
    sticky_tasks: bool,

    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
    /// tasks may overflow to other available CPUs.
    ///
    /// Special values:
    ///  - "auto" = automatically detect the CPUs based on the active power profile
    ///  - "performance" = automatically detect and prioritize the fastest CPUs
    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
    ///  - "all" = all CPUs assigned to the primary domain
    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
    // Parsed by Scheduler::resolve_energy_domain(); any other value is treated as a hex cpumask.
    #[clap(short = 'm', long, default_value = "auto")]
    primary_domain: String,

    /// Enable preferred idle CPU scanning.
    ///
    /// With this option enabled, the scheduler will prioritize assigning tasks to higher-ranked
    /// cores before considering lower-ranked ones.
    #[clap(short = 'P', long, action = clap::ArgAction::SetTrue)]
    preferred_idle_scan: bool,

    /// Disable SMT awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_smt: bool,

    /// Disable NUMA awareness.
    #[clap(long, action = clap::ArgAction::SetTrue)]
    disable_numa: bool,

    /// Enable CPU frequency control (only with schedutil governor).
    ///
    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
    cpufreq: bool,

    // The options below tune TIMELY-style delay-driven feedback; they are copied
    // verbatim into the BPF rodata in Scheduler::init() and only take effect
    // when --timely is set.
    /// Enable TIMELY mode: use TIMELY's delay-driven feedback for adaptive time slices.
    #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
    timely: bool,

    /// TIMELY lower delay threshold in microseconds.
    #[clap(long, default_value = "5000")]
    timely_tlow_us: u64,

    /// TIMELY higher delay threshold in microseconds.
    #[clap(long, default_value = "50000")]
    timely_thigh_us: u64,

    /// TIMELY minimum gain value (fixed-point).
    #[clap(long, default_value = "128")]
    timely_gain_min: u32,

    /// TIMELY gain step (fixed-point).
    #[clap(long, default_value = "32")]
    timely_gain_step: u32,

    /// TIMELY HAI threshold (fixed-point).
    #[clap(long, default_value = "768")]
    timely_hai_thresh: u32,

    /// TIMELY HAI multiplier.
    #[clap(long, default_value = "2")]
    timely_hai_multiplier: u32,

    /// TIMELY backoff low (fixed-point).
    #[clap(long, default_value = "768")]
    timely_backoff_low: u32,

    /// TIMELY backoff high (fixed-point).
    #[clap(long, default_value = "960")]
    timely_backoff_high: u32,

    /// TIMELY backoff gradient (fixed-point).
    #[clap(long, default_value = "992")]
    timely_backoff_gradient: u32,

    /// TIMELY gradient margin in microseconds.
    #[clap(long, default_value = "125")]
    timely_gradient_margin_us: u64,

    /// TIMELY control interval in microseconds.
    #[clap(long, default_value = "500")]
    timely_control_interval_us: u64,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler
    /// is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
    debug: bool,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,
}
301
/// User-space side of the scheduler: owns the loaded BPF skeleton, the
/// attach link and the stats server for the lifetime of the run.
struct Scheduler<'a> {
    skel: BpfSkel<'a>,                       // loaded BPF skeleton
    struct_ops: Option<libbpf_rs::Link>,     // attach link; dropped in run() to detach
    opts: &'a Opts,                          // parsed command-line options
    topo: Topology,                          // host CPU topology
    power_profile: PowerProfile,             // profile seen at init, re-checked in refresh_sched_domain()
    stats_server: StatsServer<(), Metrics>,  // serves Metrics to the stats monitor
    user_restart: bool,                      // set when main() should re-init the scheduler
}
311
312impl<'a> Scheduler<'a> {
    /// Build and attach the scheduler: resolve the host topology and the
    /// primary CPU domain, configure and load the BPF skeleton, attach the
    /// struct_ops and launch the stats server.
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        try_set_rlimit_infinity();

        // Initialize CPU topology.
        let topo = Topology::new().unwrap();

        // Check host topology to determine if we need to enable SMT capabilities.
        let smt_enabled = !opts.disable_smt && topo.smt_enabled;

        // Determine the amount of non-empty NUMA nodes in the system.
        let nr_nodes = topo
            .nodes
            .values()
            .filter(|node| !node.all_cpus.is_empty())
            .count();
        info!("NUMA nodes: {}", nr_nodes);

        // Automatically disable NUMA optimizations when running on non-NUMA systems.
        let numa_enabled = !opts.disable_numa && nr_nodes > 1;
        if !numa_enabled {
            info!("Disabling NUMA optimizations");
        }

        // Determine the primary scheduling domain.
        let power_profile = Self::power_profile();
        let domain =
            Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
                anyhow!(
                    "failed to resolve primary domain '{}': {}",
                    &opts.primary_domain,
                    err
                )
            })?;

        info!(
            "{} {} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION")),
            if smt_enabled { "SMT on" } else { "SMT off" }
        );

        // Print command line.
        info!(
            "scheduler options: {}",
            std::env::args().collect::<Vec<_>>().join(" ")
        );

        // Apply the requested CPU idle QoS resume latency to every CPU
        // (restored to defaults in Drop).
        if opts.idle_resume_us >= 0 {
            if !cpu_idle_resume_latency_supported() {
                warn!("idle resume latency not supported");
            } else {
                info!("Setting idle QoS to {} us", opts.idle_resume_us);
                for cpu in topo.all_cpus.values() {
                    update_cpu_idle_resume_latency(
                        cpu.id,
                        opts.idle_resume_us.try_into().unwrap(),
                    )?;
                }
            }
        }

        // Initialize BPF connector.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose);
        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops, open_opts)?;

        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;

        // Override default BPF scheduling parameters.
        // Note: time-based options are converted from microseconds to nanoseconds here.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.debug = opts.debug;
        rodata.smt_enabled = smt_enabled;
        rodata.numa_enabled = numa_enabled;
        rodata.local_pcpu = opts.local_pcpu;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.sticky_tasks = opts.sticky_tasks;
        rodata.slice_max = opts.slice_us * 1000;
        rodata.slice_min = opts.slice_min_us * 1000;
        rodata.slice_lag = opts.slice_us_lag * 1000;
        rodata.throttle_ns = opts.throttle_us * 1000;
        rodata.primary_all = domain.weight() == *NR_CPU_IDS;

        // TIMELY settings (only effective when timely_enabled=true)
        rodata.timely_enabled = opts.timely;
        rodata.timely_tlow_ns = opts.timely_tlow_us * 1000;
        rodata.timely_thigh_ns = opts.timely_thigh_us * 1000;
        rodata.timely_gain_min_fp = opts.timely_gain_min;
        rodata.timely_gain_max_fp = 1024;
        rodata.timely_gain_step_fp = opts.timely_gain_step;
        rodata.timely_hai_thresh_fp = opts.timely_hai_thresh;
        rodata.timely_hai_multiplier = opts.timely_hai_multiplier;
        rodata.timely_backoff_low_fp = opts.timely_backoff_low;
        rodata.timely_backoff_high_fp = opts.timely_backoff_high;
        rodata.timely_backoff_gradient_fp = opts.timely_backoff_gradient;
        rodata.timely_gradient_margin_ns = opts.timely_gradient_margin_us * 1000;
        rodata.timely_control_interval_ns = opts.timely_control_interval_us * 1000;

        // Generate the list of available CPUs sorted by capacity in descending order.
        let mut cpus: Vec<_> = topo.all_cpus.values().collect();
        cpus.sort_by_key(|cpu| std::cmp::Reverse(cpu.cpu_capacity));
        for (i, cpu) in cpus.iter().enumerate() {
            rodata.cpu_capacity[cpu.id] = cpu.cpu_capacity as c_ulong;
            rodata.preferred_cpus[i] = cpu.id as u64;
        }
        if opts.preferred_idle_scan {
            info!(
                "Preferred CPUs: {:?}",
                &rodata.preferred_cpus[0..cpus.len()]
            );
        }
        rodata.preferred_idle_scan = opts.preferred_idle_scan;

        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
        // (it's never a good idea to throttle per-CPU kthreads).
        rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;

        // Set scheduler flags.
        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | if numa_enabled {
                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
            } else {
                0
            };
        info!(
            "scheduler flags: {:#x}",
            skel.struct_ops.bpfland_ops_mut().flags
        );

        // Load the BPF program for validation.
        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;

        // Initialize the primary scheduling domain.
        Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
            anyhow!(
                "failed to initialize primary domain 0x{:x}: {}",
                domain,
                err
            )
        })?;

        // Initialize CPU frequency scaling.
        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
            bail!(
                "failed to initialize cpufreq performance level: error {}",
                err
            );
        }

        // Initialize SMT domains.
        if smt_enabled {
            Self::init_smt_domains(&mut skel, &topo)?;
        }

        // Attach the scheduler.
        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
        let stats_server = StatsServer::new(stats::server_data()).launch()?;

        Ok(Self {
            skel,
            struct_ops,
            opts,
            topo,
            power_profile,
            stats_server,
            user_restart: false,
        })
    }
484
    /// Add `cpu` to the primary scheduling domain by invoking the
    /// `enable_primary_cpu` BPF syscall program (a negative CPU id resets the
    /// domain — see init_energy_domain()). On failure, returns the BPF
    /// program's non-zero return value.
    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_primary_cpu;
        let mut args = cpu_arg {
            cpu_id: cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a plain C-layout struct that outlives the
            // test_run() call below, and the slice covers exactly its bytes.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }
506
507    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
508        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
509        if cpus.is_empty() {
510            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
511        }
512        Cpumask::from_str(&cpus_to_cpumask(&cpus))
513    }
514
515    fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
516        let domain = match primary_domain {
517            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
518            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
519            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
520            "auto" => match power_profile {
521                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
522                PowerProfile::Balanced { .. }
523                | PowerProfile::Performance
524                | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
525            },
526            "all" => Self::epp_to_cpumask(Powermode::Any)?,
527            &_ => Cpumask::from_str(primary_domain)?,
528        };
529
530        Ok(domain)
531    }
532
533    fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
534        info!("primary CPU domain = 0x{:x}", domain);
535
536        // Clear the primary domain by passing a negative CPU id.
537        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
538            bail!("failed to reset primary domain: error {}", err);
539        }
540
541        // Update primary scheduling domain.
542        for cpu in 0..*NR_CPU_IDS {
543            if domain.test_cpu(cpu) {
544                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
545                    bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
546                }
547            }
548        }
549
550        Ok(())
551    }
552
553    // Update hint for the cpufreq governor.
554    fn init_cpufreq_perf(
555        skel: &mut BpfSkel<'_>,
556        primary_domain: &String,
557        auto: bool,
558    ) -> Result<()> {
559        // If we are using the powersave profile always scale the CPU frequency to the minimum,
560        // otherwise use the maximum, unless automatic frequency scaling is enabled.
561        let perf_lvl: i64 = match primary_domain.as_str() {
562            "powersave" => 0,
563            _ if auto => -1,
564            _ => 1024,
565        };
566        info!(
567            "cpufreq performance level: {}",
568            match perf_lvl {
569                1024 => "max".into(),
570                0 => "min".into(),
571                n if n < 0 => "auto".into(),
572                _ => perf_lvl.to_string(),
573            }
574        );
575        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;
576
577        Ok(())
578    }
579
580    fn power_profile() -> PowerProfile {
581        let profile = fetch_power_profile(true);
582        if profile == PowerProfile::Unknown {
583            fetch_power_profile(false)
584        } else {
585            profile
586        }
587    }
588
    /// Re-check the system power profile; returns true when the scheduler
    /// should be restarted so the "auto" primary domain can be re-resolved.
    /// For non-"auto" domains, only the cpufreq performance hint is refreshed.
    fn refresh_sched_domain(&mut self) -> bool {
        // Once the profile reads as Unknown we stop polling for changes.
        if self.power_profile != PowerProfile::Unknown {
            let power_profile = Self::power_profile();
            if power_profile != self.power_profile {
                self.power_profile = power_profile;

                // "auto" domains depend on the profile: ask the caller to restart.
                if self.opts.primary_domain == "auto" {
                    return true;
                }
                if let Err(err) = Self::init_cpufreq_perf(
                    &mut self.skel,
                    &self.opts.primary_domain,
                    self.opts.cpufreq,
                ) {
                    warn!("failed to refresh cpufreq performance level: error {}", err);
                }
            }
        }

        false
    }
610
    /// Register `sibling_cpu` as the SMT sibling of `cpu` by invoking the
    /// `enable_sibling_cpu` BPF syscall program. On failure, returns the BPF
    /// program's non-zero return value.
    fn enable_sibling_cpu(
        skel: &mut BpfSkel<'_>,
        cpu: usize,
        sibling_cpu: usize,
    ) -> Result<(), u32> {
        let prog = &mut skel.progs.enable_sibling_cpu;
        let mut args = domain_arg {
            cpu_id: cpu as c_int,
            sibling_cpu_id: sibling_cpu as c_int,
        };
        let input = ProgramInput {
            // SAFETY: `args` is a plain C-layout struct that outlives the
            // test_run() call below, and the slice covers exactly its bytes.
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }
637
638    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
639        let smt_siblings = topo.sibling_cpus();
640
641        info!("SMT sibling CPUs: {:?}", smt_siblings);
642        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
643            Self::enable_sibling_cpu(skel, cpu, *sibling_cpu as usize).unwrap();
644        }
645
646        Ok(())
647    }
648
    /// Snapshot the scheduler counters exported by the BPF side (bss map)
    /// into a `Metrics` struct for the stats server.
    fn get_metrics(&self) -> Metrics {
        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
        Metrics {
            nr_running: bss_data.nr_running,
            nr_cpus: bss_data.nr_online_cpus,
            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
            nr_direct_dispatches: bss_data.nr_direct_dispatches,
            nr_shared_dispatches: bss_data.nr_shared_dispatches,
            nr_delay_recovery_dispatches: bss_data.nr_delay_recovery_dispatches,
            nr_delay_middle_add_dispatches: bss_data.nr_delay_middle_add_dispatches,
            nr_delay_fast_recovery_dispatches: bss_data.nr_delay_fast_recovery_dispatches,
            nr_delay_rate_limited_dispatches: bss_data.nr_delay_rate_limited_dispatches,
            nr_gain_floor_dispatches: bss_data.nr_gain_floor_dispatches,
            nr_gain_ceiling_dispatches: bss_data.nr_gain_ceiling_dispatches,
            nr_delay_low_region_samples: bss_data.nr_delay_low_region_samples,
            nr_delay_mid_region_samples: bss_data.nr_delay_mid_region_samples,
            nr_delay_high_region_samples: bss_data.nr_delay_high_region_samples,
            nr_gain_floor_resident_samples: bss_data.nr_gain_floor_resident_samples,
            nr_gain_mid_resident_samples: bss_data.nr_gain_mid_resident_samples,
            nr_gain_ceiling_resident_samples: bss_data.nr_gain_ceiling_resident_samples,
            nr_idle_select_path_picks: bss_data.nr_idle_select_path_picks,
            nr_idle_enqueue_path_picks: bss_data.nr_idle_enqueue_path_picks,
            nr_idle_prev_cpu_picks: bss_data.nr_idle_prev_cpu_picks,
            nr_idle_primary_picks: bss_data.nr_idle_primary_picks,
            nr_idle_spill_picks: bss_data.nr_idle_spill_picks,
            nr_idle_pick_failures: bss_data.nr_idle_pick_failures,
            nr_idle_primary_domain_misses: bss_data.nr_idle_primary_domain_misses,
            nr_idle_global_misses: bss_data.nr_idle_global_misses,
            nr_waker_cpu_biases: bss_data.nr_waker_cpu_biases,
            nr_keep_running_reuses: bss_data.nr_keep_running_reuses,
            nr_keep_running_queue_empty: bss_data.nr_keep_running_queue_empty,
            nr_keep_running_smt_blocked: bss_data.nr_keep_running_smt_blocked,
            nr_keep_running_queued_work: bss_data.nr_keep_running_queued_work,
            nr_dispatch_cpu_dsq_consumes: bss_data.nr_dispatch_cpu_dsq_consumes,
            nr_dispatch_node_dsq_consumes: bss_data.nr_dispatch_node_dsq_consumes,
            nr_cpu_release_reenqueue: bss_data.nr_cpu_release_reenqueue,
        }
    }
687
    /// Return true once the BPF side has reported an exit via the user exit
    /// info (uei) area.
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
691
    /// Main loop: serve stats requests until shutdown is requested, the BPF
    /// scheduler exits, or a power-profile change requires a restart. Detaches
    /// the struct_ops on the way out and returns the BPF-side exit info.
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            // A profile change with an "auto" domain means we must re-init.
            if self.refresh_sched_domain() {
                self.user_restart = true;
                break;
            }
            // The 1s timeout doubles as the polling interval for the checks above.
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(()) => res_ch.send(self.get_metrics())?,
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        // Dropping the link detaches the scheduler from the kernel.
        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
709}
710
711impl Drop for Scheduler<'_> {
712    fn drop(&mut self) {
713        info!("Unregister {SCHEDULER_NAME} scheduler");
714
715        // Restore default CPU idle QoS resume latency.
716        if self.opts.idle_resume_us >= 0 {
717            if cpu_idle_resume_latency_supported() {
718                for cpu in self.topo.all_cpus.values() {
719                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
720                        .unwrap();
721                }
722            }
723        }
724    }
725}
726
/// Entry point: parse options, handle the informational flags, set up logging
/// and Ctrl-C handling, optionally spawn the stats monitor, then run the
/// scheduler, re-initializing it when a restart is requested.
fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "{} {}",
            SCHEDULER_NAME,
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    // Log to stderr at Info level, with timestamps only on errors.
    let loglevel = simplelog::LevelFilter::Info;

    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        loglevel,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    // Ctrl-C only sets a flag; the run loop polls it once per second.
    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    // Spawn the stats monitor thread; in --monitor mode no scheduler is launched.
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    // Re-init the scheduler when either the kernel asks for a restart or a
    // power-profile change invalidated the "auto" primary domain.
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            if sched.user_restart {
                continue;
            }
            break;
        }
    }

    Ok(())
}