1mod bpf_skel;
9pub use bpf_skel::*;
10pub mod bpf_intf;
11pub use bpf_intf::*;
12
13mod stats;
14use std::ffi::c_int;
15use std::fs;
16use std::mem::MaybeUninit;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::time::Duration;
21
22use affinity::set_thread_affinity;
23use anyhow::bail;
24use anyhow::Context;
25use anyhow::Result;
26use clap::Parser;
27use crossbeam::channel::RecvTimeoutError;
28use libbpf_rs::OpenObject;
29use libbpf_rs::ProgramInput;
30use log::warn;
31use log::{debug, info};
32use scx_stats::prelude::*;
33use scx_utils::build_id;
34use scx_utils::scx_ops_attach;
35use scx_utils::scx_ops_load;
36use scx_utils::scx_ops_open;
37use scx_utils::set_rlimit_infinity;
38use scx_utils::uei_exited;
39use scx_utils::uei_report;
40use scx_utils::Cpumask;
41use scx_utils::Topology;
42use scx_utils::UserExitInfo;
43use scx_utils::NR_CPU_IDS;
44use stats::Metrics;
45
/// Name of this scheduler, used in the version banner and in log messages.
///
/// The `'static` lifetime is implied for references in `const` items, so it is
/// not spelled out.
const SCHEDULER_NAME: &str = "scx_tickless";
47
// Command-line options of the scheduler, parsed with clap's derive API.
//
// NOTE: plain `//` comments are used on purpose. With clap's derive, `///`
// doc comments on the struct and its fields become part of the generated
// `--help` output, which would change what the program prints.
#[derive(Debug, Parser)]
struct Opts {
    // Copied verbatim into `tickless_ops.exit_dump_len` before the BPF object
    // is loaded. Defaults to 0.
    #[clap(long, default_value = "0")]
    exit_dump_len: u32,

    // Primary CPU domain, given as a string that `Cpumask::from_str` can parse
    // (default "0x1"). The first CPU of this mask is the one the initializing
    // thread is temporarily pinned to.
    #[clap(short = 'm', long, default_value = "0x1")]
    primary_domain: String,

    // Time slice in microseconds (default 20000); multiplied by 1000 and
    // handed to the BPF side as `slice_ns`.
    #[clap(short = 's', long, default_value = "20000")]
    slice_us: u64,

    // Forwarded to the BPF side as `tick_freq`. Defaults to 0; what 0 means is
    // decided by the BPF code, which is not visible here.
    #[clap(short = 'f', long, default_value = "0")]
    frequency: u64,

    // Boolean flag forwarded to the BPF side as `prefer_same_cpu`.
    #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)]
    prefer_same_cpu: bool,

    // When set, SMT is reported as disabled to the BPF side regardless of what
    // the detected topology says.
    #[clap(short = 'n', long, action = clap::ArgAction::SetTrue)]
    nosmt: bool,

    // Interval in seconds (fractional values allowed) for the stats monitor
    // thread that runs alongside the scheduler.
    #[clap(long)]
    stats: Option<f64>,

    // Interval in seconds for monitor-only mode: the stats monitor thread is
    // started, joined, and the program exits without starting a scheduler.
    // Takes precedence over `stats` when both are given.
    #[clap(long)]
    monitor: Option<f64>,

    // Passed to the BPF object builder's `debug()` setting.
    #[clap(short = 'v', long, action = clap::ArgAction::SetTrue)]
    verbose: bool,

    // Print the scheduler name and version, then exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    // Print the description of the exported statistics, then exit.
    #[clap(long)]
    help_stats: bool,
}
104
/// Reports whether `/sys/devices/system/cpu/nohz_full` lists any CPUs.
///
/// Returns `true` only when the file can be read and its whitespace-trimmed
/// contents are neither empty nor the literal `(null)`. Any read error is
/// treated as "not enabled" and yields `false`.
pub fn is_nohz_enabled() -> bool {
    fs::read_to_string("/sys/devices/system/cpu/nohz_full")
        .map(|raw| {
            let value = raw.trim();
            !(value.is_empty() || value == "(null)")
        })
        .unwrap_or(false)
}
112
/// Userspace state of a running scheduler instance.
///
/// Field order is significant: Rust drops fields in declaration order.
struct Scheduler<'a> {
    /// Loaded BPF skeleton; its read-only data is configured in `init()` and
    /// its counters are read back in `get_metrics()`.
    skel: BpfSkel<'a>,
    /// Link returned by `scx_ops_attach!`. `run()` takes it out of the
    /// `Option` (dropping it) before the exit information is reported.
    struct_ops: Option<libbpf_rs::Link>,
    /// Stats server launched in `init()`; `run()` answers its requests with
    /// the current `Metrics`.
    stats_server: StatsServer<(), Metrics>,
}
118
119impl<'a> Scheduler<'a> {
120 fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
121 set_rlimit_infinity();
122
123 let topo = Topology::new().unwrap();
125 let smt_enabled = !opts.nosmt && topo.smt_enabled;
126 info!(
127 "{} {} {}",
128 SCHEDULER_NAME,
129 build_id::full_version(env!("CARGO_PKG_VERSION")),
130 if smt_enabled { "SMT on" } else { "SMT off" }
131 );
132
133 if !is_nohz_enabled() {
135 warn!("nohz_full is not enabled in the kernel");
136 }
137
138 let domain = Cpumask::from_str(&opts.primary_domain)?;
140 info!("primary CPU domain = 0x{:x}", domain);
141
142 let mut skel_builder = BpfSkelBuilder::default();
144 skel_builder.obj_builder.debug(opts.verbose);
145 let mut skel = scx_ops_open!(skel_builder, open_object, tickless_ops)?;
146
147 skel.struct_ops.tickless_ops_mut().exit_dump_len = opts.exit_dump_len;
148
149 skel.maps.rodata_data.smt_enabled = smt_enabled;
150 skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u32;
151
152 skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
154 skel.maps.rodata_data.tick_freq = opts.frequency;
155 skel.maps.rodata_data.prefer_same_cpu = opts.prefer_same_cpu;
156
157 let mut skel = scx_ops_load!(skel, tickless_ops, uei)?;
159
160 let timer_cpu = domain.iter().next();
163 if timer_cpu.is_none() {
164 bail!("primary cpumask is empty");
165 }
166 if let Err(e) = set_thread_affinity(&[timer_cpu.unwrap() as usize]) {
167 bail!("cannot set central CPU affinity: {}", e);
168 }
169
170 if let Err(err) = Self::init_primary_domain(&mut skel, &domain) {
172 warn!("failed to initialize primary domain: error {}", err);
173 }
174
175 let struct_ops = Some(scx_ops_attach!(skel, tickless_ops)?);
177 let stats_server = StatsServer::new(stats::server_data()).launch()?;
178
179 if let Err(e) = set_thread_affinity((0..*NR_CPU_IDS).collect::<Vec<usize>>()) {
181 bail!("cannot reset CPU affinity: {}", e);
182 }
183
184 Ok(Self {
185 skel,
186 struct_ops,
187 stats_server,
188 })
189 }
190
191 fn enable_primary_cpu(skel: &mut BpfSkel<'_>, cpu: i32) -> Result<(), u32> {
192 let prog = &mut skel.progs.enable_primary_cpu;
193 let mut args = cpu_arg {
194 cpu_id: cpu as c_int,
195 };
196 let input = ProgramInput {
197 context_in: Some(unsafe {
198 std::slice::from_raw_parts_mut(
199 &mut args as *mut _ as *mut u8,
200 std::mem::size_of_val(&args),
201 )
202 }),
203 ..Default::default()
204 };
205 let out = prog.test_run(input).unwrap();
206 if out.return_value != 0 {
207 return Err(out.return_value);
208 }
209
210 Ok(())
211 }
212
213 fn init_primary_domain(skel: &mut BpfSkel<'_>, domain: &Cpumask) -> Result<()> {
214 if let Err(err) = Self::enable_primary_cpu(skel, -1) {
216 warn!("failed to reset primary domain: error {}", err as i32);
217 }
218 for cpu in 0..*NR_CPU_IDS {
220 if domain.test_cpu(cpu) {
221 if let Err(err) = Self::enable_primary_cpu(skel, cpu as i32) {
222 warn!("failed to add CPU {} to primary domain: error {}", cpu, err);
223 }
224 }
225 }
226
227 Ok(())
228 }
229
230 fn get_metrics(&self) -> Metrics {
231 Metrics {
232 nr_ticks: self.skel.maps.bss_data.nr_ticks,
233 nr_preemptions: self.skel.maps.bss_data.nr_preemptions,
234 nr_direct_dispatches: self.skel.maps.bss_data.nr_direct_dispatches,
235 nr_primary_dispatches: self.skel.maps.bss_data.nr_primary_dispatches,
236 nr_timer_dispatches: self.skel.maps.bss_data.nr_timer_dispatches,
237 }
238 }
239
240 pub fn exited(&mut self) -> bool {
241 uei_exited!(&self.skel, uei)
242 }
243
244 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
245 let (res_ch, req_ch) = self.stats_server.channels();
246 while !shutdown.load(Ordering::Relaxed) && !self.exited() {
247 match req_ch.recv_timeout(Duration::from_secs(1)) {
248 Ok(()) => res_ch.send(self.get_metrics())?,
249 Err(RecvTimeoutError::Timeout) => {}
250 Err(e) => Err(e)?,
251 }
252 }
253
254 let _ = self.struct_ops.take();
255 uei_report!(&self.skel, uei)
256 }
257}
258
/// Logs an informational message when a `Scheduler` is dropped.
///
/// No explicit cleanup happens here; the struct's fields are dropped by Rust
/// after this method returns.
impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {} scheduler", SCHEDULER_NAME);
    }
}
264
265fn main() -> Result<()> {
266 let opts = Opts::parse();
267
268 if opts.version {
269 println!(
270 "{} {}",
271 SCHEDULER_NAME,
272 build_id::full_version(env!("CARGO_PKG_VERSION"))
273 );
274 return Ok(());
275 }
276
277 if opts.help_stats {
278 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
279 return Ok(());
280 }
281
282 let loglevel = simplelog::LevelFilter::Info;
283
284 let mut lcfg = simplelog::ConfigBuilder::new();
285 lcfg.set_time_offset_to_local()
286 .expect("Failed to set local time offset")
287 .set_time_level(simplelog::LevelFilter::Error)
288 .set_location_level(simplelog::LevelFilter::Off)
289 .set_target_level(simplelog::LevelFilter::Off)
290 .set_thread_level(simplelog::LevelFilter::Off);
291 simplelog::TermLogger::init(
292 loglevel,
293 lcfg.build(),
294 simplelog::TerminalMode::Stderr,
295 simplelog::ColorChoice::Auto,
296 )?;
297
298 let shutdown = Arc::new(AtomicBool::new(false));
299 let shutdown_clone = shutdown.clone();
300 ctrlc::set_handler(move || {
301 shutdown_clone.store(true, Ordering::Relaxed);
302 })
303 .context("Error setting Ctrl-C handler")?;
304
305 if let Some(intv) = opts.monitor.or(opts.stats) {
306 let shutdown_copy = shutdown.clone();
307 let jh = std::thread::spawn(move || {
308 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
309 Ok(_) => {
310 debug!("stats monitor thread finished successfully")
311 }
312 Err(error_object) => {
313 warn!(
314 "stats monitor thread finished because of an error {}",
315 error_object
316 )
317 }
318 }
319 });
320 if opts.monitor.is_some() {
321 let _ = jh.join();
322 return Ok(());
323 }
324 }
325
326 let mut open_object = MaybeUninit::uninit();
327 loop {
328 let mut sched = Scheduler::init(&opts, &mut open_object)?;
329 if !sched.run(shutdown.clone())?.should_restart() {
330 break;
331 }
332 }
333
334 Ok(())
335}