scx_rlfifo/main.rs
// Copyright (c) Andrea Righi <andrea.righi@linux.dev>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

//! # Round-Robin Linux kernel scheduler that runs in user-space
//!
//! ## Overview
//!
//! This is a fully functional Round-Robin scheduler for the Linux kernel that operates
//! in user space and is implemented entirely in Rust.
//!
//! It dequeues tasks in FIFO order and assigns dynamic time slices, preempting and
//! re-enqueuing tasks to achieve basic Round-Robin behavior.
//!
//! The scheduler is designed to serve as a simple template for developers looking to implement
//! more advanced scheduling policies.
//!
//! It is based on `scx_rustland_core`, a framework specifically designed to simplify the
//! creation of user-space schedulers, leveraging the Linux kernel's `sched_ext` feature (a
//! technology that allows schedulers to be implemented in BPF).
//!
//! The `scx_rustland_core` crate offers an abstraction layer over `sched_ext`, enabling developers
//! to write schedulers in Rust without needing to interact directly with low-level kernel or BPF
//! internals.
//!
//! ## scx_rustland_core API
//!
//! ### struct `BpfScheduler`
//!
//! The `BpfScheduler` struct is the core interface for interacting with `sched_ext` via BPF.
//!
//! - **Initialization**:
//!   - `BpfScheduler::init()` registers the scheduler and initializes the BPF component.
//!
//! - **Task Management**:
//!   - `dequeue_task()`: Consume a task that wants to run, returning a `QueuedTask` object
//!   - `select_cpu(pid: i32, prev_cpu: i32, flags: u64)`: Select an idle CPU for a task
//!   - `dispatch_task(task: &DispatchedTask)`: Dispatch a task
//!
//! - **Completion Notification**:
//!   - `notify_complete(nr_pending: u64)`: Give control to the BPF component and report the
//!     number of tasks that are still pending (this function can sleep)
//!
//! Each task received from dequeue_task() contains the following:
//!
//! struct QueuedTask {
//!     pub pid: i32,              // pid that uniquely identifies a task
//!     pub cpu: i32,              // CPU previously used by the task
//!     pub nr_cpus_allowed: u64,  // Number of CPUs that the task can use
//!     pub flags: u64,            // task's enqueue flags
//!     pub start_ts: u64,         // Timestamp since last time the task ran on a CPU (in ns)
//!     pub stop_ts: u64,          // Timestamp since last time the task released a CPU (in ns)
//!     pub exec_runtime: u64,     // Total cpu time since last sleep (in ns)
//!     pub weight: u64,           // Task priority in the range [1..10000] (default is 100)
//!     pub vtime: u64,            // Current task vruntime / deadline (set by the scheduler)
//!     pub comm: [c_char; TASK_COMM_LEN], // Task's executable name
//! }
//!
//! Each task dispatched using dispatch_task() contains the following:
//!
//! struct DispatchedTask {
//!     pub pid: i32,       // pid that uniquely identifies a task
//!     pub cpu: i32,       // target CPU selected by the scheduler
//!                         // (RL_CPU_ANY = dispatch on the first CPU available)
//!     pub flags: u64,     // task's enqueue flags
//!     pub slice_ns: u64,  // time slice in nanoseconds assigned to the task
//!                         // (0 = use default time slice)
//!     pub vtime: u64,     // this value can be used to send the task's vruntime or deadline
//!                         // directly to the underlying BPF dispatcher
//! }
//!
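//! As an illustration, the per-task flow used by this scheduler (see dispatch_tasks()
//! below) boils down to the following (error handling simplified):
//!
//! while let Ok(Some(task)) = self.bpf.dequeue_task() {
//!     let mut dispatched = DispatchedTask::new(&task);
//!     let cpu = self.bpf.select_cpu(task.pid, task.cpu, task.flags);
//!     dispatched.cpu = if cpu >= 0 { cpu } else { RL_CPU_ANY };
//!     self.bpf.dispatch_task(&dispatched).unwrap();
//! }
//! self.bpf.notify_complete(0);
//!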
//! Other internal statistics that can be used to implement better scheduling policies:
//!
//! let n: u64 = *self.bpf.nr_online_cpus_mut();        // number of online CPUs
//! let n: u64 = *self.bpf.nr_running_mut();            // number of currently running tasks
//! let n: u64 = *self.bpf.nr_queued_mut();             // number of tasks queued to be scheduled
//! let n: u64 = *self.bpf.nr_scheduled_mut();          // number of tasks managed by the user-space scheduler
//! let n: u64 = *self.bpf.nr_user_dispatches_mut();    // number of user-space dispatches
//! let n: u64 = *self.bpf.nr_kernel_dispatches_mut();  // number of kernel dispatches
//! let n: u64 = *self.bpf.nr_cancel_dispatches_mut();  // number of cancelled dispatches
//! let n: u64 = *self.bpf.nr_bounce_dispatches_mut();  // number of bounced dispatches
//! let n: u64 = *self.bpf.nr_failed_dispatches_mut();  // number of failed dispatches
//! let n: u64 = *self.bpf.nr_sched_congested_mut();    // number of scheduler congestion events
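//!
//! The main loop of this scheduler (see run() below) simply repeats the dispatch step until
//! the BPF component signals an exit, roughly:
//!
//! while !self.bpf.exited() {
//!     self.dispatch_tasks();
//! }
//! self.bpf.shutdown_and_report()
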
mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;

#[rustfmt::skip]
mod bpf;
use std::mem::MaybeUninit;
use std::time::SystemTime;

use anyhow::Result;
use bpf::*;
use libbpf_rs::OpenObject;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::UserExitInfo;

// Maximum time slice (in nanoseconds) that a task can use before it is re-enqueued.
const SLICE_NS: u64 = 5_000_000;

struct Scheduler<'a> {
    bpf: BpfScheduler<'a>, // Connector to the sched_ext BPF backend
}

impl<'a> Scheduler<'a> {
    fn init(open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        let open_opts = LibbpfOpts::default();
        let bpf = BpfScheduler::init(
            open_object,
            open_opts.clone().into_bpf_open_opts(),
            0,        // exit_dump_len (buffer size of exit info, 0 = default)
            false,    // partial (false = include all tasks)
            false,    // debug (false = debug mode off)
            true,     // builtin_idle (true = allow BPF to use idle CPUs if available)
            SLICE_NS, // default time slice (for tasks automatically dispatched by the backend)
            "rlfifo", // name of the scx ops
        )?;
        Ok(Self { bpf })
    }

    fn dispatch_tasks(&mut self) {
        // Get the number of tasks that are waiting to be scheduled.
        let nr_waiting = *self.bpf.nr_queued_mut();

        // Start consuming and dispatching tasks until all the CPUs are busy or there are no
        // more tasks to be dispatched.
        while let Ok(Some(task)) = self.bpf.dequeue_task() {
            // Create a new task to be dispatched from the received enqueued task.
            let mut dispatched_task = DispatchedTask::new(&task);

            // Decide where the task needs to run (pick a target CPU).
            //
            // A call to select_cpu() will return the most suitable idle CPU for the task,
            // prioritizing its previously used CPU (task.cpu).
            //
            // If we can't find any idle CPU, run on the first CPU available.
            let cpu = self.bpf.select_cpu(task.pid, task.cpu, task.flags);
            dispatched_task.cpu = if cpu >= 0 { cpu } else { RL_CPU_ANY };

            // Determine the task's time slice: assign a value inversely proportional to the
            // number of tasks waiting to be scheduled.
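            //
            // For example, if 4 tasks are waiting (nr_waiting = 4), each dispatched task
            // receives SLICE_NS / 5 = 1ms.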
            dispatched_task.slice_ns = SLICE_NS / (nr_waiting + 1);

            // Dispatch the task.
            self.bpf.dispatch_task(&dispatched_task).unwrap();
        }

        // Notify the BPF component that tasks have been dispatched.
        //
        // This function will put the scheduler to sleep until another task needs to run.
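        //
        // Passing 0 reports that no tasks are left pending in user space, since the loop
        // above has drained the queue.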
        self.bpf.notify_complete(0);
    }

    fn print_stats(&mut self) {
        // Internal scx_rustland_core statistics.
        let nr_user_dispatches = *self.bpf.nr_user_dispatches_mut();
        let nr_kernel_dispatches = *self.bpf.nr_kernel_dispatches_mut();
        let nr_cancel_dispatches = *self.bpf.nr_cancel_dispatches_mut();
        let nr_bounce_dispatches = *self.bpf.nr_bounce_dispatches_mut();
        let nr_failed_dispatches = *self.bpf.nr_failed_dispatches_mut();
        let nr_sched_congested = *self.bpf.nr_sched_congested_mut();

        println!(
            "user={} kernel={} cancel={} bounce={} fail={} cong={}",
            nr_user_dispatches,
            nr_kernel_dispatches,
            nr_cancel_dispatches,
            nr_bounce_dispatches,
            nr_failed_dispatches,
            nr_sched_congested,
        );
    }

    fn now() -> u64 {
        SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_secs()
    }

    fn run(&mut self) -> Result<UserExitInfo> {
        let mut prev_ts = Self::now();

        while !self.bpf.exited() {
            self.dispatch_tasks();

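            // Print the statistics at most once per second (now() has one-second granularity).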
            let curr_ts = Self::now();
            if curr_ts > prev_ts {
                self.print_stats();
                prev_ts = curr_ts;
            }
        }
        self.bpf.shutdown_and_report()
    }
}

fn print_warning() {
    let warning = r#"
**************************************************************************

WARNING: The purpose of scx_rlfifo is to provide a simple scheduler
implementation based on scx_rustland_core, and it is not intended for
use in production environments. If you want to run a scheduler that makes
decisions in user space, it is recommended to use *scx_rustland* instead.

Please do not open GitHub issues in the event of poor performance or
scheduler eviction due to a runnable task timeout. However, if running this
scheduler results in a system crash or the entire system becoming unresponsive,
please open a GitHub issue.

**************************************************************************"#;

    println!("{}", warning);
}

fn main() -> Result<()> {
    print_warning();

    // Initialize and load the FIFO scheduler.
    let mut open_object = MaybeUninit::uninit();
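    // Keep restarting the scheduler until it exits without requesting a restart.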
    loop {
        let mut sched = Scheduler::init(&mut open_object)?;
        if !sched.run()?.should_restart() {
            break;
        }
    }

    Ok(())
}