// scx_rlfifo/main.rs

// Copyright (c) Andrea Righi <andrea.righi@linux.dev>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

//! # Round-Robin Linux kernel scheduler that runs in user-space
//!
//! ## Overview
//!
//! This is a fully functional Round-Robin scheduler for the Linux kernel that operates
//! in user-space and it is 100% implemented in Rust.
//!
//! It dequeues tasks in FIFO order and assigns dynamic time slices, preempting and
//! re-enqueuing tasks to achieve basic Round-Robin behavior.
//!
//! The scheduler is designed to serve as a simple template for developers looking to implement
//! more advanced scheduling policies.
//!
//! It is based on `scx_rustland_core`, a framework that is specifically designed to simplify the
//! creation of user-space schedulers, leveraging the Linux kernel's `sched_ext` feature (a
//! technology that allows implementing schedulers in BPF).
//!
//! The `scx_rustland_core` crate offers an abstraction layer over `sched_ext`, enabling developers
//! to write schedulers in Rust without needing to interact directly with low-level kernel or BPF
//! internal details.
//!
//! ## scx_rustland_core API
//!
//! ### struct `BpfScheduler`
//!
//! The `BpfScheduler` struct is the core interface for interacting with `sched_ext` via BPF.
//!
//! - **Initialization**:
//!   - `BpfScheduler::init()` registers the scheduler and initializes the BPF component.
//!
//! - **Task Management**:
//!   - `dequeue_task()`: Consume a task that wants to run, returns a QueuedTask object
//!   - `select_cpu(pid: i32, prev_cpu: i32, flags: u64)`: Select an idle CPU for a task
//!   - `dispatch_task(task: &DispatchedTask)`: Dispatch a task
//!
//! - **Completion Notification**:
//!   - `notify_complete(nr_pending: u64)` Give control to the BPF component and report the number
//!      of tasks that are still pending (this function can sleep)
//!
//! Each task received from dequeue_task() contains the following:
//!
//! struct QueuedTask {
//!     pub pid: i32,              // pid that uniquely identifies a task
//!     pub cpu: i32,              // CPU previously used by the task
//!     pub flags: u64,            // task's enqueue flags
//!     pub sum_exec_runtime: u64, // Total cpu time in nanoseconds
//!     pub weight: u64,           // Task priority in the range [1..10000] (default is 100)
//!     pub nvcsw: u64,            // Total amount of voluntary context switches
//!     pub slice: u64,            // Remaining time slice budget
//!     pub vtime: u64,            // Current task vruntime / deadline (set by the scheduler)
//! }
//!
//! Each task dispatched using dispatch_task() contains the following:
//!
//! struct DispatchedTask {
//!     pub pid: i32,      // pid that uniquely identifies a task
//!     pub cpu: i32,      // target CPU selected by the scheduler
//!                        // (RL_CPU_ANY = dispatch on the first CPU available)
//!     pub flags: u64,    // task's enqueue flags
//!     pub slice_ns: u64, // time slice in nanoseconds assigned to the task
//!                        // (0 = use default time slice)
//!     pub vtime: u64,    // this value can be used to send the task's vruntime or deadline
//!                        // directly to the underlying BPF dispatcher
//! }
//!
//! Other internal statistics that can be used to implement better scheduling policies:
//!
//!  let n: u64 = *self.bpf.nr_online_cpus_mut();       // amount of online CPUs
//!  let n: u64 = *self.bpf.nr_running_mut();           // amount of currently running tasks
//!  let n: u64 = *self.bpf.nr_queued_mut();            // amount of tasks queued to be scheduled
//!  let n: u64 = *self.bpf.nr_scheduled_mut();         // amount of tasks managed by the user-space scheduler
//!  let n: u64 = *self.bpf.nr_user_dispatches_mut();   // amount of user-space dispatches
//!  let n: u64 = *self.bpf.nr_kernel_dispatches_mut(); // amount of kernel dispatches
//!  let n: u64 = *self.bpf.nr_cancel_dispatches_mut(); // amount of cancelled dispatches
//!  let n: u64 = *self.bpf.nr_bounce_dispatches_mut(); // amount of bounced dispatches
//!  let n: u64 = *self.bpf.nr_failed_dispatches_mut(); // amount of failed dispatches
//!  let n: u64 = *self.bpf.nr_sched_congested_mut();   // amount of scheduler congestion events
84mod bpf_skel;
85pub use bpf_skel::*;
86pub mod bpf_intf;
87
88#[rustfmt::skip]
89mod bpf;
90use std::mem::MaybeUninit;
91use std::time::SystemTime;
92
93use anyhow::Result;
94use bpf::*;
95use libbpf_rs::OpenObject;
96use scx_utils::UserExitInfo;
97
// Maximum time slice (in nanoseconds) that a task can use before it is re-enqueued.
// 5 ms here; the slice actually assigned at dispatch time is this value divided by
// (nr_waiting + 1), so it shrinks as more tasks are waiting to be scheduled.
const SLICE_NS: u64 = 5_000_000;
100
/// User-space side of the scheduler: owns the connection to the sched_ext
/// BPF backend for the lifetime `'a` of the open BPF object.
struct Scheduler<'a> {
    bpf: BpfScheduler<'a>, // Connector to the sched_ext BPF backend
}
104
105impl<'a> Scheduler<'a> {
106    fn init(open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
107        let bpf = BpfScheduler::init(
108            open_object,
109            0,     // exit_dump_len (buffer size of exit info, 0 = default)
110            false, // partial (false = include all tasks)
111            false, // debug (false = debug mode off)
112            false, // builtin_idle (false = idle selection policy in user-space)
113        )?;
114        Ok(Self { bpf })
115    }
116
117    fn dispatch_tasks(&mut self) {
118        // Get the amount of tasks that are waiting to be scheduled.
119        let nr_waiting = *self.bpf.nr_queued_mut();
120
121        // Start consuming and dispatching tasks, until all the CPUs are busy or there are no more
122        // tasks to be dispatched.
123        while let Ok(Some(task)) = self.bpf.dequeue_task() {
124            // Create a new task to be dispatched from the received enqueued task.
125            let mut dispatched_task = DispatchedTask::new(&task);
126
127            // Decide where the task needs to run (pick a target CPU).
128            //
129            // A call to select_cpu() will return the most suitable idle CPU for the task,
130            // prioritizing its previously used CPU (task.cpu).
131            //
132            // If we can't find any idle CPU, keep the task running on the same CPU.
133            let cpu = self.bpf.select_cpu(task.pid, task.cpu, task.flags);
134            dispatched_task.cpu = if cpu >= 0 { cpu } else { task.cpu };
135
136            // Determine the task's time slice: assign value inversely proportional to the number
137            // of tasks waiting to be scheduled.
138            dispatched_task.slice_ns = SLICE_NS / (nr_waiting + 1);
139
140            // Dispatch the task.
141            self.bpf.dispatch_task(&dispatched_task).unwrap();
142        }
143
144        // Notify the BPF component that tasks have been dispatched.
145        //
146        // This function will put the scheduler to sleep, until another task needs to run.
147        self.bpf.notify_complete(0);
148    }
149
150    fn print_stats(&mut self) {
151        // Internal scx_rustland_core statistics.
152        let nr_user_dispatches = *self.bpf.nr_user_dispatches_mut();
153        let nr_kernel_dispatches = *self.bpf.nr_kernel_dispatches_mut();
154        let nr_cancel_dispatches = *self.bpf.nr_cancel_dispatches_mut();
155        let nr_bounce_dispatches = *self.bpf.nr_bounce_dispatches_mut();
156        let nr_failed_dispatches = *self.bpf.nr_failed_dispatches_mut();
157        let nr_sched_congested = *self.bpf.nr_sched_congested_mut();
158
159        println!(
160            "user={} kernel={} cancel={} bounce={} fail={} cong={}",
161            nr_user_dispatches,
162            nr_kernel_dispatches,
163            nr_cancel_dispatches,
164            nr_bounce_dispatches,
165            nr_failed_dispatches,
166            nr_sched_congested,
167        );
168    }
169
170    fn now() -> u64 {
171        SystemTime::now()
172            .duration_since(SystemTime::UNIX_EPOCH)
173            .unwrap()
174            .as_secs()
175    }
176
177    fn run(&mut self) -> Result<UserExitInfo> {
178        let mut prev_ts = Self::now();
179
180        while !self.bpf.exited() {
181            self.dispatch_tasks();
182
183            let curr_ts = Self::now();
184            if curr_ts > prev_ts {
185                self.print_stats();
186                prev_ts = curr_ts;
187            }
188        }
189        self.bpf.shutdown_and_report()
190    }
191}
192
/// Print a banner reminding users that scx_rlfifo is a teaching example built
/// on scx_rustland_core, not a scheduler meant for production workloads.
fn print_warning() {
    println!(
        "{}",
        r#"
**************************************************************************

WARNING: The purpose of scx_rlfifo is to provide a simple scheduler
implementation based on scx_rustland_core, and it is not intended for
use in production environments. If you want to run a scheduler that makes
decisions in user space, it is recommended to use *scx_rustland* instead.

Please do not open GitHub issues in the event of poor performance, or
scheduler eviction due to a runnable task timeout. However, if running this
scheduler results in a system crash or the entire system becoming unresponsive,
please open a GitHub issue.

**************************************************************************"#
    );
}
211
212fn main() -> Result<()> {
213    print_warning();
214
215    // Initialize and load the FIFO scheduler.
216    let mut open_object = MaybeUninit::uninit();
217    loop {
218        let mut sched = Scheduler::init(&mut open_object)?;
219        if !sched.run()?.should_restart() {
220            break;
221        }
222    }
223
224    Ok(())
225}