Skip to main content

scx_bpfland/
main.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2024 Andrea Righi <andrea.righi@linux.dev>
4
5// This software may be used and distributed according to the terms of the
6// GNU General Public License version 2.
7
8mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::{c_int, c_ulong};
15use std::fmt::Write;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use anyhow::anyhow;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::warn;
31use log::{debug, info};
32use scx_stats::prelude::*;
33use scx_utils::autopower::{fetch_power_profile, PowerProfile};
34use scx_utils::build_id;
35use scx_utils::compat;
36use scx_utils::get_primary_cpus;
37use scx_utils::libbpf_clap_opts::LibbpfOpts;
38use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
39use scx_utils::scx_ops_attach;
40use scx_utils::scx_ops_load;
41use scx_utils::scx_ops_open;
42use scx_utils::try_set_rlimit_infinity;
43use scx_utils::uei_exited;
44use scx_utils::uei_report;
45use scx_utils::Cpumask;
46use scx_utils::Powermode;
47use scx_utils::Topology;
48use scx_utils::UserExitInfo;
49use scx_utils::NR_CPU_IDS;
50use stats::Metrics;
51
52const SCHEDULER_NAME: &str = "scx_bpfland";
53
/// Convert a list of CPU ids into a hexadecimal cpumask string of arbitrary width.
///
/// The mask is as wide as needed to hold the highest CPU id (rounded up to whole bytes) and is
/// rendered most-significant byte first with a `0x` prefix, e.g. `[0, 1, 2, 3]` becomes `"0x0f"`.
/// Duplicate ids are harmless. An empty list yields the keyword `"none"`.
fn cpus_to_cpumask(cpus: &[usize]) -> String {
    // `max()` is `None` only for an empty list, which maps to the "none" keyword.
    let max_cpu_id = match cpus.iter().max() {
        Some(&id) => id,
        None => return String::from("none"),
    };

    // One bit per CPU id, rounded up to a whole number of bytes.
    let mut bitmask = vec![0u8; max_cpu_id / 8 + 1];

    // Set the bit that corresponds to each CPU id.
    for &cpu_id in cpus {
        bitmask[cpu_id / 8] |= 1 << (cpu_id % 8);
    }

    // Render the bytes from the most significant to the least significant one.
    let mut mask = String::with_capacity(2 + 2 * bitmask.len());
    mask.push_str("0x");
    for byte in bitmask.iter().rev() {
        // Formatting into a `String` cannot fail, so the result can be ignored.
        let _ = write!(mask, "{:02x}", byte);
    }

    mask
}
81
82/// scx_bpfland: a vruntime-based sched_ext scheduler that prioritizes interactive workloads.
83///
84/// This scheduler is derived from scx_rustland, but it is fully implemented in BPF. It has a minimal
85/// user-space part written in Rust to process command line options, collect metrics and log out
86/// scheduling statistics.
87///
88/// The BPF part makes all the scheduling decisions (see src/bpf/main.bpf.c).
89#[derive(Debug, Parser)]
90struct Opts {
91    /// Exit debug dump buffer length. 0 indicates default.
92    #[clap(long, default_value = "0")]
93    exit_dump_len: u32,
94
95    /// Maximum scheduling slice duration in microseconds.
96    #[clap(short = 's', long, default_value = "1000")]
97    slice_us: u64,
98
99    /// Minimum scheduling slice duration in microseconds (0 = no minimum time slice).
100    #[clap(short = 'L', long, default_value = "0")]
101    slice_min_us: u64,
102
103    /// Maximum time slice lag in microseconds.
104    ///
105    /// A positive value can help to enhance the responsiveness of interactive tasks, but it can
106    /// also make performance more "spikey".
107    #[clap(short = 'l', long, default_value = "40000")]
108    slice_us_lag: u64,
109
110    /// Throttle the running CPUs by periodically injecting idle cycles.
111    ///
112    /// This option can help extend battery life on portable devices, reduce heating, fan noise
113    /// and overall energy consumption (0 = disable).
114    #[clap(short = 't', long, default_value = "0")]
115    throttle_us: u64,
116
117    /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
118    ///
119    /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
120    /// performance at the cost of higher power consumption. Alternatively, increasing the latency
121    /// value may reduce performance, but also improve power efficiency.
122    #[clap(short = 'I', long, allow_hyphen_values = true, default_value = "-1")]
123    idle_resume_us: i64,
124
125    /// Enable per-CPU tasks prioritization.
126    ///
127    /// This allows to prioritize per-CPU tasks that usually tend to be de-prioritized (since they
128    /// can't be migrated when their only usable CPU is busy). Enabling this option can introduce
129    /// unfairness and potentially trigger stalls, but it can improve performance of server-type
130    /// workloads (such as large parallel builds).
131    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
132    local_pcpu: bool,
133
134    /// Enable kthreads prioritization (EXPERIMENTAL).
135    ///
136    /// Enabling this can improve system performance, but it may also introduce noticeable
137    /// interactivity issues or unfairness in scenarios with high kthread activity, such as heavy
138    /// I/O or network traffic.
139    ///
140    /// Use it only when conducting specific experiments or if you have a clear understanding of
141    /// its implications.
142    #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)]
143    local_kthreads: bool,
144
145    /// Disable direct dispatch during synchronous wakeups.
146    ///
147    /// Enabling this option can lead to a more uniform load distribution across available cores,
148    /// potentially improving performance in certain scenarios. However, it may come at the cost of
149    /// reduced efficiency for pipe-intensive workloads that benefit from tighter producer-consumer
150    /// coupling.
151    #[clap(short = 'w', long, action = clap::ArgAction::SetTrue)]
152    no_wake_sync: bool,
153
154    /// Enable sticky tasks.
155    ///
156    /// If enabled force tasks with a high rate of enqueues/sec to stay on the same CPU, to reduce
157    /// locking contention on the shared runqueues.
158    ///
159    /// This can help making the scheduler more robust with intensive scheduling workloads and
160    /// benchmarks, but it can negatively impact on latency.
161    #[clap(short = 'S', long, action = clap::ArgAction::SetTrue)]
162    sticky_tasks: bool,
163
164    /// Specifies the initial set of CPUs, represented as a bitmask in hex (e.g., 0xff), that the
165    /// scheduler will use to dispatch tasks, until the system becomes saturated, at which point
166    /// tasks may overflow to other available CPUs.
167    ///
168    /// Special values:
169    ///  - "auto" = automatically detect the CPUs based on the active power profile
170    ///  - "performance" = automatically detect and prioritize the fastest CPUs
171    ///  - "powersave" = automatically detect and prioritize the slowest CPUs
172    ///  - "all" = all CPUs assigned to the primary domain
173    ///  - "none" = no prioritization, tasks are dispatched on the first CPU available
174    #[clap(short = 'm', long, default_value = "auto")]
175    primary_domain: String,
176
177    /// Enable preferred idle CPU scanning.
178    ///
179    /// With this option enabled, the scheduler will prioritize assigning tasks to higher-ranked
180    /// cores before considering lower-ranked ones.
181    #[clap(short = 'P', long, action = clap::ArgAction::SetTrue)]
182    preferred_idle_scan: bool,
183
184    /// Disable SMT awareness.
185    #[clap(long, action = clap::ArgAction::SetTrue)]
186    disable_smt: bool,
187
188    /// Disable NUMA awareness.
189    #[clap(long, action = clap::ArgAction::SetTrue)]
190    disable_numa: bool,
191
192    /// Enable CPU frequency control (only with schedutil governor).
193    ///
194    /// With this option enabled the CPU frequency will be automatically scaled based on the load.
195    #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)]
196    cpufreq: bool,
197
198    /// Enable TIMELY mode: use TIMELY's delay-driven feedback for adaptive time slices.
199    #[clap(short = 'T', long, action = clap::ArgAction::SetTrue)]
200    timely: bool,
201
202    /// TIMELY lower delay threshold in microseconds.
203    #[clap(long, default_value = "5000")]
204    timely_tlow_us: u64,
205
206    /// TIMELY higher delay threshold in microseconds.
207    #[clap(long, default_value = "50000")]
208    timely_thigh_us: u64,
209
210    /// TIMELY minimum gain value (fixed-point).
211    #[clap(long, default_value = "128")]
212    timely_gain_min: u32,
213
214    /// TIMELY gain step (fixed-point).
215    #[clap(long, default_value = "32")]
216    timely_gain_step: u32,
217
218    /// TIMELY HAI threshold (fixed-point).
219    #[clap(long, default_value = "768")]
220    timely_hai_thresh: u32,
221
222    /// TIMELY HAI multiplier.
223    #[clap(long, default_value = "2")]
224    timely_hai_multiplier: u32,
225
226    /// TIMELY backoff low (fixed-point).
227    #[clap(long, default_value = "768")]
228    timely_backoff_low: u32,
229
230    /// TIMELY backoff high (fixed-point).
231    #[clap(long, default_value = "960")]
232    timely_backoff_high: u32,
233
234    /// TIMELY backoff gradient (fixed-point).
235    #[clap(long, default_value = "992")]
236    timely_backoff_gradient: u32,
237
238    /// TIMELY gradient margin in microseconds.
239    #[clap(long, default_value = "125")]
240    timely_gradient_margin_us: u64,
241
242    /// TIMELY control interval in microseconds.
243    #[clap(long, default_value = "500")]
244    timely_control_interval_us: u64,
245
246    /// Enable stats monitoring with the specified interval.
247    #[clap(long)]
248    stats: Option<f64>,
249
250    /// Run in stats monitoring mode with the specified interval. Scheduler
251    /// is not launched.
252    #[clap(long)]
253    monitor: Option<f64>,
254
255    /// Enable BPF debugging via /sys/kernel/tracing/trace_pipe.
256    #[clap(short = 'd', long, action = clap::ArgAction::SetTrue)]
257    debug: bool,
258
259    /// Enable verbose output, including libbpf details.
260    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
261    verbose: bool,
262
263    /// Print scheduler version and exit.
264    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
265    version: bool,
266
267    /// Show descriptions for statistics.
268    #[clap(long)]
269    help_stats: bool,
270
271    #[clap(flatten, next_help_heading = "Libbpf Options")]
272    pub libbpf: LibbpfOpts,
273}
274
/// User-space state of a running scx_bpfland scheduler instance.
///
/// Built by `Scheduler::init()`, driven by `Scheduler::run()`; the `Drop` implementation
/// restores the per-CPU idle QoS resume latency when it was overridden.
struct Scheduler<'a> {
    /// Loaded BPF skeleton (programs and maps).
    skel: BpfSkel<'a>,
    /// Link returned when the scheduler is attached; taken and dropped at the end of `run()`.
    struct_ops: Option<libbpf_rs::Link>,
    /// Parsed command line options.
    opts: &'a Opts,
    /// Host CPU topology detected at initialization.
    topo: Topology,
    /// Power profile observed most recently; compared against the current one on refresh.
    power_profile: PowerProfile,
    /// Stats server used to answer metrics requests from within `run()`.
    stats_server: StatsServer<(), Metrics>,
    /// Set by `run()` when a power profile change requires the scheduler to be re-initialized.
    user_restart: bool,
}
284
285impl<'a> Scheduler<'a> {
286    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
287        try_set_rlimit_infinity();
288
289        // Initialize CPU topology.
290        let topo = Topology::new().unwrap();
291
292        // Check host topology to determine if we need to enable SMT capabilities.
293        let smt_enabled = !opts.disable_smt && topo.smt_enabled;
294
295        // Determine the amount of non-empty NUMA nodes in the system.
296        let nr_nodes = topo
297            .nodes
298            .values()
299            .filter(|node| !node.all_cpus.is_empty())
300            .count();
301        info!("NUMA nodes: {}", nr_nodes);
302
303        // Automatically disable NUMA optimizations when running on non-NUMA systems.
304        let numa_enabled = !opts.disable_numa && nr_nodes > 1;
305        if !numa_enabled {
306            info!("Disabling NUMA optimizations");
307        }
308
309        // Determine the primary scheduling domain.
310        let power_profile = Self::power_profile();
311        let domain =
312            Self::resolve_energy_domain(&opts.primary_domain, power_profile).map_err(|err| {
313                anyhow!(
314                    "failed to resolve primary domain '{}': {}",
315                    &opts.primary_domain,
316                    err
317                )
318            })?;
319
320        info!(
321            "{} {} {}",
322            SCHEDULER_NAME,
323            build_id::full_version(env!("CARGO_PKG_VERSION")),
324            if smt_enabled { "SMT on" } else { "SMT off" }
325        );
326
327        // Print command line.
328        info!(
329            "scheduler options: {}",
330            std::env::args().collect::<Vec<_>>().join(" ")
331        );
332
333        if opts.idle_resume_us >= 0 {
334            if !cpu_idle_resume_latency_supported() {
335                warn!("idle resume latency not supported");
336            } else {
337                info!("Setting idle QoS to {} us", opts.idle_resume_us);
338                for cpu in topo.all_cpus.values() {
339                    update_cpu_idle_resume_latency(
340                        cpu.id,
341                        opts.idle_resume_us.try_into().unwrap(),
342                    )?;
343                }
344            }
345        }
346
347        // Initialize BPF connector.
348        let mut skel_builder = BpfSkelBuilder::default();
349        skel_builder.obj_builder.debug(opts.verbose);
350        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
351        let mut skel = scx_ops_open!(skel_builder, open_object, bpfland_ops, open_opts)?;
352
353        skel.struct_ops.bpfland_ops_mut().exit_dump_len = opts.exit_dump_len;
354
355        // Override default BPF scheduling parameters.
356        let rodata = skel.maps.rodata_data.as_mut().unwrap();
357        rodata.debug = opts.debug;
358        rodata.smt_enabled = smt_enabled;
359        rodata.numa_enabled = numa_enabled;
360        rodata.local_pcpu = opts.local_pcpu;
361        rodata.no_wake_sync = opts.no_wake_sync;
362        rodata.sticky_tasks = opts.sticky_tasks;
363        rodata.slice_max = opts.slice_us * 1000;
364        rodata.slice_min = opts.slice_min_us * 1000;
365        rodata.slice_lag = opts.slice_us_lag * 1000;
366        rodata.throttle_ns = opts.throttle_us * 1000;
367        rodata.primary_all = domain.weight() == *NR_CPU_IDS;
368
369        // TIMELY settings (only effective when timely_enabled=true)
370        rodata.timely_enabled = opts.timely;
371        rodata.timely_tlow_ns = opts.timely_tlow_us * 1000;
372        rodata.timely_thigh_ns = opts.timely_thigh_us * 1000;
373        rodata.timely_gain_min_fp = opts.timely_gain_min;
374        rodata.timely_gain_max_fp = 1024;
375        rodata.timely_gain_step_fp = opts.timely_gain_step;
376        rodata.timely_hai_thresh_fp = opts.timely_hai_thresh;
377        rodata.timely_hai_multiplier = opts.timely_hai_multiplier;
378        rodata.timely_backoff_low_fp = opts.timely_backoff_low;
379        rodata.timely_backoff_high_fp = opts.timely_backoff_high;
380        rodata.timely_backoff_gradient_fp = opts.timely_backoff_gradient;
381        rodata.timely_gradient_margin_ns = opts.timely_gradient_margin_us * 1000;
382        rodata.timely_control_interval_ns = opts.timely_control_interval_us * 1000;
383
384        // Generate the list of available CPUs sorted by capacity in descending order.
385        let mut cpus: Vec<_> = topo.all_cpus.values().collect();
386        cpus.sort_by_key(|cpu| std::cmp::Reverse(cpu.cpu_capacity));
387        for (i, cpu) in cpus.iter().enumerate() {
388            rodata.cpu_capacity[cpu.id] = cpu.cpu_capacity as c_ulong;
389            rodata.preferred_cpus[i] = cpu.id as u64;
390        }
391        if opts.preferred_idle_scan {
392            info!(
393                "Preferred CPUs: {:?}",
394                &rodata.preferred_cpus[0..cpus.len()]
395            );
396        }
397        rodata.preferred_idle_scan = opts.preferred_idle_scan;
398
399        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
400        // (it's never a good idea to throttle per-CPU kthreads).
401        rodata.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;
402
403        // Set scheduler flags.
404        skel.struct_ops.bpfland_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
405            | *compat::SCX_OPS_ENQ_LAST
406            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
407            | *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
408            | if numa_enabled {
409                *compat::SCX_OPS_BUILTIN_IDLE_PER_NODE
410            } else {
411                0
412            };
413        info!(
414            "scheduler flags: {:#x}",
415            skel.struct_ops.bpfland_ops_mut().flags
416        );
417
418        // Load the BPF program for validation.
419        let mut skel = scx_ops_load!(skel, bpfland_ops, uei)?;
420
421        // Initialize the primary scheduling domain.
422        Self::init_energy_domain(&mut skel, &domain).map_err(|err| {
423            anyhow!(
424                "failed to initialize primary domain 0x{:x}: {}",
425                domain,
426                err
427            )
428        })?;
429
430        // Initialize CPU frequency scaling.
431        if let Err(err) = Self::init_cpufreq_perf(&mut skel, &opts.primary_domain, opts.cpufreq) {
432            bail!(
433                "failed to initialize cpufreq performance level: error {}",
434                err
435            );
436        }
437
438        // Initialize SMT domains.
439        if smt_enabled {
440            Self::init_smt_domains(&mut skel, &topo)?;
441        }
442
443        // Attach the scheduler.
444        let struct_ops = Some(scx_ops_attach!(skel, bpfland_ops)?);
445        let stats_server = StatsServer::new(stats::server_data()).launch()?;
446
447        Ok(Self {
448            skel,
449            struct_ops,
450            opts,
451            topo,
452            power_profile,
453            stats_server,
454            user_restart: false,
455        })
456    }
457
458    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
459        let prog = &mut skel.progs.enable_primary_cpu;
460        let mut args = cpu_arg {
461            cpu_id: cpu as c_int,
462        };
463        let input = ProgramInput {
464            context_in: Some(unsafe {
465                std::slice::from_raw_parts_mut(
466                    &mut args as *mut _ as *mut u8,
467                    std::mem::size_of_val(&args),
468                )
469            }),
470            ..Default::default()
471        };
472        let out = prog.test_run(input).unwrap();
473        if out.return_value != 0 {
474            return Err(out.return_value);
475        }
476
477        Ok(())
478    }
479
480    fn epp_to_cpumask(profile: Powermode) -> Result<Cpumask> {
481        let mut cpus = get_primary_cpus(profile).unwrap_or_default();
482        if cpus.is_empty() {
483            cpus = get_primary_cpus(Powermode::Any).unwrap_or_default();
484        }
485        Cpumask::from_str(&cpus_to_cpumask(&cpus))
486    }
487
488    fn resolve_energy_domain(primary_domain: &str, power_profile: PowerProfile) -> Result<Cpumask> {
489        let domain = match primary_domain {
490            "powersave" => Self::epp_to_cpumask(Powermode::Powersave)?,
491            "performance" => Self::epp_to_cpumask(Powermode::Performance)?,
492            "turbo" => Self::epp_to_cpumask(Powermode::Turbo)?,
493            "auto" => match power_profile {
494                PowerProfile::Powersave => Self::epp_to_cpumask(Powermode::Powersave)?,
495                PowerProfile::Balanced { .. }
496                | PowerProfile::Performance
497                | PowerProfile::Unknown => Self::epp_to_cpumask(Powermode::Any)?,
498            },
499            "all" => Self::epp_to_cpumask(Powermode::Any)?,
500            &_ => Cpumask::from_str(primary_domain)?,
501        };
502
503        Ok(domain)
504    }
505
506    fn init_energy_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
507        info!("primary CPU domain = 0x{:x}", domain);
508
509        // Clear the primary domain by passing a negative CPU id.
510        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
511            bail!("failed to reset primary domain: error {}", err);
512        }
513
514        // Update primary scheduling domain.
515        for cpu in 0..*NR_CPU_IDS {
516            if domain.test_cpu(cpu) {
517                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
518                    bail!("failed to add CPU {} to primary domain: error {}", cpu, err);
519                }
520            }
521        }
522
523        Ok(())
524    }
525
526    // Update hint for the cpufreq governor.
527    fn init_cpufreq_perf(
528        skel: &mut BpfSkel<'_>,
529        primary_domain: &String,
530        auto: bool,
531    ) -> Result<()> {
532        // If we are using the powersave profile always scale the CPU frequency to the minimum,
533        // otherwise use the maximum, unless automatic frequency scaling is enabled.
534        let perf_lvl: i64 = match primary_domain.as_str() {
535            "powersave" => 0,
536            _ if auto => -1,
537            _ => 1024,
538        };
539        info!(
540            "cpufreq performance level: {}",
541            match perf_lvl {
542                1024 => "max".into(),
543                0 => "min".into(),
544                n if n < 0 => "auto".into(),
545                _ => perf_lvl.to_string(),
546            }
547        );
548        skel.maps.bss_data.as_mut().unwrap().cpufreq_perf_lvl = perf_lvl;
549
550        Ok(())
551    }
552
553    fn power_profile() -> PowerProfile {
554        let profile = fetch_power_profile(true);
555        if profile == PowerProfile::Unknown {
556            fetch_power_profile(false)
557        } else {
558            profile
559        }
560    }
561
562    fn refresh_sched_domain(&mut self) -> bool {
563        if self.power_profile != PowerProfile::Unknown {
564            let power_profile = Self::power_profile();
565            if power_profile != self.power_profile {
566                self.power_profile = power_profile;
567
568                if self.opts.primary_domain == "auto" {
569                    return true;
570                }
571                if let Err(err) = Self::init_cpufreq_perf(
572                    &mut self.skel,
573                    &self.opts.primary_domain,
574                    self.opts.cpufreq,
575                ) {
576                    warn!("failed to refresh cpufreq performance level: error {}", err);
577                }
578            }
579        }
580
581        false
582    }
583
584    fn enable_sibling_cpu(
585        skel: &mut BpfSkel<'_>,
586        cpu: usize,
587        sibling_cpu: usize,
588    ) -> Result<(), u32> {
589        let prog = &mut skel.progs.enable_sibling_cpu;
590        let mut args = domain_arg {
591            cpu_id: cpu as c_int,
592            sibling_cpu_id: sibling_cpu as c_int,
593        };
594        let input = ProgramInput {
595            context_in: Some(unsafe {
596                std::slice::from_raw_parts_mut(
597                    &mut args as *mut _ as *mut u8,
598                    std::mem::size_of_val(&args),
599                )
600            }),
601            ..Default::default()
602        };
603        let out = prog.test_run(input).unwrap();
604        if out.return_value != 0 {
605            return Err(out.return_value);
606        }
607
608        Ok(())
609    }
610
611    fn init_smt_domains(skel: &mut BpfSkel<'_>, topo: &Topology) -> Result<(), std::io::Error> {
612        let smt_siblings = topo.sibling_cpus();
613
614        info!("SMT sibling CPUs: {:?}", smt_siblings);
615        for (cpu, sibling_cpu) in smt_siblings.iter().enumerate() {
616            Self::enable_sibling_cpu(skel, cpu, *sibling_cpu as usize).unwrap();
617        }
618
619        Ok(())
620    }
621
622    fn get_metrics(&self) -> Metrics {
623        let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
624        Metrics {
625            nr_running: bss_data.nr_running,
626            nr_cpus: bss_data.nr_online_cpus,
627            nr_kthread_dispatches: bss_data.nr_kthread_dispatches,
628            nr_direct_dispatches: bss_data.nr_direct_dispatches,
629            nr_shared_dispatches: bss_data.nr_shared_dispatches,
630            nr_delay_recovery_dispatches: bss_data.nr_delay_recovery_dispatches,
631            nr_delay_middle_add_dispatches: bss_data.nr_delay_middle_add_dispatches,
632            nr_delay_fast_recovery_dispatches: bss_data.nr_delay_fast_recovery_dispatches,
633            nr_delay_rate_limited_dispatches: bss_data.nr_delay_rate_limited_dispatches,
634            nr_gain_floor_dispatches: bss_data.nr_gain_floor_dispatches,
635            nr_gain_ceiling_dispatches: bss_data.nr_gain_ceiling_dispatches,
636            nr_delay_low_region_samples: bss_data.nr_delay_low_region_samples,
637            nr_delay_mid_region_samples: bss_data.nr_delay_mid_region_samples,
638            nr_delay_high_region_samples: bss_data.nr_delay_high_region_samples,
639            nr_gain_floor_resident_samples: bss_data.nr_gain_floor_resident_samples,
640            nr_gain_mid_resident_samples: bss_data.nr_gain_mid_resident_samples,
641            nr_gain_ceiling_resident_samples: bss_data.nr_gain_ceiling_resident_samples,
642            nr_idle_select_path_picks: bss_data.nr_idle_select_path_picks,
643            nr_idle_enqueue_path_picks: bss_data.nr_idle_enqueue_path_picks,
644            nr_idle_prev_cpu_picks: bss_data.nr_idle_prev_cpu_picks,
645            nr_idle_primary_picks: bss_data.nr_idle_primary_picks,
646            nr_idle_spill_picks: bss_data.nr_idle_spill_picks,
647            nr_idle_pick_failures: bss_data.nr_idle_pick_failures,
648            nr_idle_primary_domain_misses: bss_data.nr_idle_primary_domain_misses,
649            nr_idle_global_misses: bss_data.nr_idle_global_misses,
650            nr_waker_cpu_biases: bss_data.nr_waker_cpu_biases,
651            nr_keep_running_reuses: bss_data.nr_keep_running_reuses,
652            nr_keep_running_queue_empty: bss_data.nr_keep_running_queue_empty,
653            nr_keep_running_smt_blocked: bss_data.nr_keep_running_smt_blocked,
654            nr_keep_running_queued_work: bss_data.nr_keep_running_queued_work,
655            nr_dispatch_cpu_dsq_consumes: bss_data.nr_dispatch_cpu_dsq_consumes,
656            nr_dispatch_node_dsq_consumes: bss_data.nr_dispatch_node_dsq_consumes,
657            nr_cpu_release_reenqueue: bss_data.nr_cpu_release_reenqueue,
658        }
659    }
660
    /// Report whether the BPF scheduler has exited, as evaluated by the `uei_exited!` macro on
    /// the skeleton's `uei` record.
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
664
665    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
666        let (res_ch, req_ch) = self.stats_server.channels();
667        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
668            if self.refresh_sched_domain() {
669                self.user_restart = true;
670                break;
671            }
672            match req_ch.recv_timeout(Duration::from_secs(1)) {
673                Ok(()) => res_ch.send(self.get_metrics())?,
674                Err(RecvTimeoutError::Timeout) => {}
675                Err(e) => Err(e)?,
676            }
677        }
678
679        let _ = self.struct_ops.take();
680        uei_report!(&self.skel, uei)
681    }
682}
683
684impl Drop for Scheduler<'_> {
685    fn drop(&mut self) {
686        info!("Unregister {SCHEDULER_NAME} scheduler");
687
688        // Restore default CPU idle QoS resume latency.
689        if self.opts.idle_resume_us >= 0 {
690            if cpu_idle_resume_latency_supported() {
691                for cpu in self.topo.all_cpus.values() {
692                    update_cpu_idle_resume_latency(cpu.id, cpu.pm_qos_resume_latency_us as i32)
693                        .unwrap();
694                }
695            }
696        }
697    }
698}
699
700fn main() -> Result<()> {
701    let opts = Opts::parse();
702
703    if opts.version {
704        println!(
705            "{} {}",
706            SCHEDULER_NAME,
707            build_id::full_version(env!("CARGO_PKG_VERSION"))
708        );
709        return Ok(());
710    }
711
712    if opts.help_stats {
713        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
714        return Ok(());
715    }
716
717    let loglevel = simplelog::LevelFilter::Info;
718
719    let mut lcfg = simplelog::ConfigBuilder::new();
720    lcfg.set_time_offset_to_local()
721        .expect("Failed to set local time offset")
722        .set_time_level(simplelog::LevelFilter::Error)
723        .set_location_level(simplelog::LevelFilter::Off)
724        .set_target_level(simplelog::LevelFilter::Off)
725        .set_thread_level(simplelog::LevelFilter::Off);
726    simplelog::TermLogger::init(
727        loglevel,
728        lcfg.build(),
729        simplelog::TerminalMode::Stderr,
730        simplelog::ColorChoice::Auto,
731    )?;
732
733    let shutdown = Arc::new(AtomicBool::new(false));
734    let shutdown_clone = shutdown.clone();
735    ctrlc::set_handler(move || {
736        shutdown_clone.store(true, Ordering::Relaxed);
737    })
738    .context("Error setting Ctrl-C handler")?;
739
740    if let Some(intv) = opts.monitor.or(opts.stats) {
741        let shutdown_copy = shutdown.clone();
742        let jh = std::thread::spawn(move || {
743            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
744                Ok(_) => {
745                    debug!("stats monitor thread finished successfully")
746                }
747                Err(error_object) => {
748                    warn!(
749                        "stats monitor thread finished because of an error {}",
750                        error_object
751                    )
752                }
753            }
754        });
755        if opts.monitor.is_some() {
756            let _ = jh.join();
757            return Ok(());
758        }
759    }
760
761    let mut open_object = MaybeUninit::uninit();
762    loop {
763        let mut sched = Scheduler::init(&opts, &mut open_object)?;
764        if !sched.run(shutdown.clone())?.should_restart() {
765            if sched.user_restart {
766                continue;
767            }
768            break;
769        }
770    }
771
772    Ok(())
773}