// scx_tickless/main.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
4
5// This software may be used and distributed according to the terms of the
6// GNU General Public License version 2.
7
8mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::c_int;
15use std::fs;
16use std::mem::MaybeUninit;
17use std::sync::Arc;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::time::Duration;
21
22use affinity::set_thread_affinity;
23use anyhow::Context;
24use anyhow::Result;
25use anyhow::bail;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::warn;
31use log::{debug, info};
32use scx_stats::prelude::*;
33use scx_utils::Cpumask;
34use scx_utils::NR_CPU_IDS;
35use scx_utils::Topology;
36use scx_utils::UserExitInfo;
37use scx_utils::build_id;
38use scx_utils::scx_ops_attach;
39use scx_utils::scx_ops_load;
40use scx_utils::scx_ops_open;
41use scx_utils::set_rlimit_infinity;
42use scx_utils::uei_exited;
43use scx_utils::uei_report;
44use stats::Metrics;
45
/// Name used to identify the scheduler in log and version output.
// `'static` is implied for const references; spelling it out is redundant
// (clippy::redundant_static_lifetimes).
const SCHEDULER_NAME: &str = "scx_tickless";
47
// Command-line options.
//
// NOTE: the `///` doc comments on the fields below are surfaced verbatim by
// clap as the `--help` output, so they are user-facing runtime text, not
// just source documentation.
#[derive(Debug, Parser)]
struct Opts {
    /// Exit debug dump buffer length. 0 indicates default.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    /// Define the set of CPUs, represented as a bitmask in hex (e.g., 0xff), dedicated to process
    /// scheduling events.
    #[clap(short = 'm', long, default_value = "0x1")]
    primary_domain: String,

    /// Maximum scheduling slice duration in microseconds (applied only when multiple tasks are
    /// contending the same CPU).
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    /// Frequency of the tick triggered on the scheduling CPUs to check for task time slice
    /// expiration (0 == CONFIG_HZ).
    ///
    /// A higher frequency can increase the overall system responsiveness but it can also introduce
    /// more scheduling overhead and load on the primary CPUs.
    #[clap(short = 'f', long, default_value = "0")]
    frequency: u64,

    /// Try to keep tasks running on the same CPU
    ///
    /// This can help to improve cache locality at the cost of introducing some extra overhead in
    /// the scheduler (and increase the load on the primary CPUs).
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    prefer_same_cpu: bool,

    /// Disable SMT topology awareness.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    nosmt: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler
    /// is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Enable verbose output, including libbpf details.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}
104
/// Report whether the kernel exposes a non-empty `nohz_full` CPU list,
/// i.e. whether full tickless operation is enabled.
///
/// Reads `/sys/devices/system/cpu/nohz_full`; a missing/unreadable file,
/// an empty value, or the literal string "(null)" all mean disabled.
pub fn is_nohz_enabled() -> bool {
    match fs::read_to_string("/sys/devices/system/cpu/nohz_full") {
        Ok(contents) => {
            let value = contents.trim();
            !value.is_empty() && value != "(null)"
        }
        Err(_) => false,
    }
}
112
/// Runtime state of the user-space side of the scheduler.
struct Scheduler<'a> {
    /// Loaded BPF skeleton (maps, programs, rodata/bss views).
    skel: BpfSkel<'a>,
    /// Link to the attached sched_ext struct_ops; taken (dropped) to detach.
    struct_ops: Option<libbpf_rs::Link>,
    /// Server that answers stats requests with `Metrics` snapshots.
    stats_server: StatsServer<(), Metrics>,
}
118
119impl<'a> Scheduler<'a> {
120    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
121        set_rlimit_infinity();
122
123        // Initialize CPU topology.
124        let topo = Topology::new().unwrap();
125        let smt_enabled = !opts.nosmt && topo.smt_enabled;
126        info!(
127            "{} {} {}",
128            SCHEDULER_NAME,
129            build_id::full_version(env!("CARGO_PKG_VERSION")),
130            if smt_enabled { "SMT on" } else { "SMT off" }
131        );
132
133        // Check if nohz_full is enabled.
134        if !is_nohz_enabled() {
135            warn!("nohz_full is not enabled in the kernel");
136        }
137
138        // Process the domain of primary CPUs.
139        let domain = Cpumask::from_str(&opts.primary_domain)?;
140        info!("primary CPU domain = 0x{:x}", domain);
141
142        // Initialize BPF connector.
143        let mut skel_builder = BpfSkelBuilder::default();
144        skel_builder.obj_builder.debug(opts.verbose);
145        let mut skel = scx_ops_open!(skel_builder, open_object, tickless_ops)?;
146
147        skel.struct_ops.tickless_ops_mut().exit_dump_len = opts.exit_dump_len;
148
149        skel.maps.rodata_data.smt_enabled = smt_enabled;
150        skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u32;
151
152        // Override default BPF scheduling parameters.
153        skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
154        skel.maps.rodata_data.tick_freq = opts.frequency;
155        skel.maps.rodata_data.prefer_same_cpu = opts.prefer_same_cpu;
156
157        // Load the BPF program for validation.
158        let mut skel = scx_ops_load!(skel, tickless_ops, uei)?;
159
160        // Set task affinity to the first primary CPU.
161        let timer_cpu = domain.iter().next();
162        if timer_cpu.is_none() {
163            bail!("primary cpumask is empty");
164        }
165        if let Err(e) = set_thread_affinity(&[timer_cpu.unwrap() as usize]) {
166            bail!("cannot set central CPU affinity: {}", e);
167        }
168
169        // Initialize the group of primary CPUs.
170        if let Err(err) = Self::init_primary_domain(&mut skel, &domain) {
171            warn!("failed to initialize primary domain: error {}", err);
172        }
173
174        // Attach the scheduler.
175        let struct_ops = Some(scx_ops_attach!(skel, tickless_ops)?);
176        let stats_server = StatsServer::new(stats::server_data()).launch()?;
177
178        // Reset task affinity.
179        if let Err(e) = set_thread_affinity((0..*NR_CPU_IDS).collect::<Vec<usize>>()) {
180            bail!("cannot reset CPU affinity: {}", e);
181        }
182
183        Ok(Self {
184            skel,
185            struct_ops,
186            stats_server,
187        })
188    }
189
190    fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
191        let prog = &mut skel.progs.enable_primary_cpu;
192        let mut args = cpu_arg {
193            cpu_id: cpu as c_int,
194        };
195        let input = ProgramInput {
196            context_in: Some(unsafe {
197                std::slice::from_raw_parts_mut(
198                    &mut args as *mut _ as *mut u8,
199                    std::mem::size_of_val(&args),
200                )
201            }),
202            ..Default::default()
203        };
204        let out = prog.test_run(input).unwrap();
205        if out.return_value != 0 {
206            return Err(out.return_value);
207        }
208
209        Ok(())
210    }
211
212    fn init_primary_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
213        // Clear the primary domain by passing a negative CPU id.
214        if let Err(err) = Self::enable_primary_cpu(skel, -1) {
215            warn!("failed to reset primary domain: error {}", err as i32);
216        }
217        // Update primary scheduling domain.
218        for cpu in 0..*NR_CPU_IDS {
219            if domain.test_cpu(cpu) {
220                if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
221                    warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
222                }
223            }
224        }
225
226        Ok(())
227    }
228
229    fn get_metrics(&self) -> Metrics {
230        Metrics {
231            nr_ticks: self.skel.maps.bss_data.nr_ticks,
232            nr_preemptions: self.skel.maps.bss_data.nr_preemptions,
233            nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
234            nr_primary_dispatches: self.skel.maps.bss_data.nr_primary_dispatches,
235            nr_timer_dispatches: self.skel.maps.bss_data.nr_timer_dispatches,
236        }
237    }
238
239    pub fn exited(&mut self) -> bool {
240        uei_exited!(&self.skel, uei)
241    }
242
243    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
244        let (res_ch, req_ch) = self.stats_server.channels();
245        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
246            match req_ch.recv_timeout(Duration::from_secs(1)) {
247                Ok(()) => res_ch.send(self.get_metrics())?,
248                Err(RecvTimeoutError::Timeout) => {}
249                Err(e) => Err(e)?,
250            }
251        }
252
253        self.struct_ops.take();
254        uei_report!(&self.skel, uei)
255    }
256}
257
impl Drop for Scheduler<'_> {
    /// Emit an informational message when the scheduler instance is torn
    /// down (detachment itself happens when `struct_ops` is dropped/taken).
    fn drop(&mut self) {
        info!("Unregister {} scheduler", SCHEDULER_NAME);
    }
}
263
264fn main() -> Result<()> {
265    let opts = Opts::parse();
266
267    if opts.version {
268        println!(
269            "{} {}",
270            SCHEDULER_NAME,
271            build_id::full_version(env!("CARGO_PKG_VERSION"))
272        );
273        return Ok(());
274    }
275
276    if opts.help_stats {
277        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
278        return Ok(());
279    }
280
281    let loglevel = simplelog::LevelFilter::Info;
282
283    let mut lcfg = simplelog::ConfigBuilder::new();
284    lcfg.set_time_level(simplelog::LevelFilter::Error)
285        .set_location_level(simplelog::LevelFilter::Off)
286        .set_target_level(simplelog::LevelFilter::Off)
287        .set_thread_level(simplelog::LevelFilter::Off);
288    simplelog::TermLogger::init(
289        loglevel,
290        lcfg.build(),
291        simplelog::TerminalMode::Stderr,
292        simplelog::ColorChoice::Auto,
293    )?;
294
295    let shutdown = Arc::new(AtomicBool::new(false));
296    let shutdown_clone = shutdown.clone();
297    ctrlc::set_handler(move || {
298        shutdown_clone.store(true, Ordering::Relaxed);
299    })
300    .context("Error setting Ctrl-C handler")?;
301
302    if let Some(intv) = opts.monitor.or(opts.stats) {
303        let shutdown_copy = shutdown.clone();
304        let jh = std::thread::spawn(move || {
305            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
306                Ok(_) => {
307                    debug!("stats monitor thread finished successfully")
308                }
309                Err(error_object) => {
310                    warn!(
311                        "stats monitor thread finished because of an error {}",
312                        error_object
313                    )
314                }
315            }
316        });
317        if opts.monitor.is_some() {
318            let _ = jh.join();
319            return Ok(());
320        }
321    }
322
323    let mut open_object = MaybeUninit::uninit();
324    loop {
325        let mut sched = Scheduler::init(&opts, &mut open_object)?;
326        if !sched.run(shutdown.clone())?.should_restart() {
327            break;
328        }
329    }
330
331    Ok(())
332}