scx_lavd/
main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Valve Corporation.
// Author: Changwoo Min <changwoo@igalia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod cpu_order;
use scx_utils::init_libbpf_logging;
mod stats;
use std::ffi::c_int;
use std::ffi::CStr;
use std::mem;
use std::mem::MaybeUninit;
use std::str;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use clap_num::number_range;
use cpu_order::CpuOrder;
use cpu_order::PerfCpuOrder;
use crossbeam::channel;
use crossbeam::channel::Receiver;
use crossbeam::channel::RecvTimeoutError;
use crossbeam::channel::Sender;
use crossbeam::channel::TrySendError;
use libbpf_rs::skel::Skel;
use libbpf_rs::OpenObject;
use libbpf_rs::PrintLevel;
use libbpf_rs::ProgramInput;
use libc::c_char;
use log::debug;
use log::info;
use plain::Plain;
use scx_arena::ArenaLib;
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::try_set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::EnergyModel;
use scx_utils::TopologyArgs;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::SchedSample;
use stats::SchedSamples;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;

const SCHEDULER_NAME: &str = "scx_lavd";

/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The Rust part is minimal. It processes command-line options and logs
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See main.bpf.c for a more detailed overview of the LAVD design.
#[derive(Debug, Parser)]
struct Opts {
    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced), CPU preference order, etc., based on the
    /// system load. The options affecting the power mode and the use of core
    /// compaction (--autopower, --performance, --powersave, --balanced,
    /// --no-core-compaction) cannot be used with this option. When no option
    /// is specified, this is the default mode.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced) based on the system's active power profile.
    /// The scheduler's power mode decides the CPU preference order and the use
    /// of core compaction, so the options affecting these (--autopilot,
    /// --performance, --powersave, --balanced, --no-core-compaction) cannot
    /// be used with this option.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    /// Run the scheduler in performance mode to get maximum performance.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --balanced, --powersave, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run the scheduler in powersave mode to minimize power consumption.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --performance, --balanced, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run the scheduler in balanced mode, aiming for a sweet spot between
    /// power and performance. This option cannot be used with other
    /// conflicting options (--autopilot, --autopower, --performance,
    /// --powersave, --no-core-compaction) affecting the use of core compaction.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    /// Migration delta threshold percentage (0-100). When set to a non-zero
    /// value, the threshold calculation uses the average utilization instead
    /// of the current utilization, and the threshold is calculated as:
    /// avg_load * (mig-delta-pct / 100). Additionally, it disables force task
    /// stealing in the consume path, relying only on the is_stealer/is_stealee
    /// thresholds for more predictable load balancing. The default is 0
    /// (disabled: a dynamic, load-based threshold with both probabilistic and
    /// force task stealing enabled). This is an experimental feature.
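    /// For example, with "--mig-delta-pct 25" and an average utilization of
    /// 40%, the migration threshold is 40% * 0.25 = 10%.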
    #[clap(long = "mig-delta-pct", default_value = "0", value_parser=Opts::mig_delta_pct_range)]
    mig_delta_pct: u8,

    /// Slice duration in microseconds to use for all tasks when pinned tasks
    /// are running on a CPU. Must be between slice-min-us and slice-max-us.
    /// When this option is enabled, pinned tasks are always enqueued to
    /// per-CPU DSQs, and the dispatch logic compares vtimes across all DSQs
    /// to select the task with the lowest vtime. This helps improve
    /// responsiveness for pinned tasks.
    #[clap(long = "pinned-slice-us")]
    pinned_slice_us: Option<u64>,

    /// Limit preemption to roughly the top P% of latency-critical tasks.
    /// When N is given as an argument, P is 0.5^N * 100. The default value
    /// is 6, which limits preemption to the top 1.56% of latency-critical
    /// tasks.
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4"). The scheduler
    /// uses the CPU preference order only when core compaction is enabled
    /// (i.e., balanced or powersave mode is specified as an option or chosen
    /// in the autopilot or autopower mode). When "--cpu-pref-order" is given,
    /// it implies "--no-use-em".
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    /// Do not use the energy model in making CPU preference order decisions.
    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Disable preemption.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    /// Disable an optimization for synchronous wake-ups.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Disable dynamic slice boost for long-running tasks.
    #[clap(long = "no-slice-boost", action = clap::ArgAction::SetTrue)]
    no_slice_boost: bool,

    /// Enable per-CPU DSQs, so tasks are queued to and dispatched from
    /// CPU-specific DSQs. This generally increases L1/L2 cache locality for
    /// tasks and lowers lock contention compared to shared DSQs, but at the
    /// cost of higher load-balancing complexity. This is a highly
    /// experimental feature.
    #[clap(long = "per-cpu-dsq", action = clap::ArgAction::SetTrue)]
    per_cpu_dsq: bool,

    /// Enable CPU bandwidth control using cpu.max in cgroup v2.
    /// This is a highly experimental feature.
    #[clap(long = "enable-cpu-bw", action = clap::ArgAction::SetTrue)]
    enable_cpu_bw: bool,

    /// Disable core compaction so the scheduler uses all the online CPUs.
    /// Core compaction attempts to minimize the number of actively used CPUs
    /// for unaffinitized tasks, respecting the CPU preference order.
    /// Normally, core compaction is enabled by the power mode (i.e.,
    /// balanced or powersave mode is specified as an option or chosen in
    /// the autopilot or autopower mode). This option cannot be used with the
    /// other options that control core compaction (--autopilot,
    /// --autopower, --performance, --balanced, --powersave).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Disable controlling the CPU frequency.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The
    /// scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    /// Enable verbose output, including libbpf details. Specify multiple
    /// times to increase verbosity.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,

    /// Topology configuration options.
    #[clap(flatten)]
    topology: Option<TopologyArgs>,
}

impl Opts {
    fn can_autopilot(&self) -> bool {
        !self.autopower
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_autopower(&self) -> bool {
        !self.autopilot
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_performance(&self) -> bool {
        !self.autopilot && !self.autopower && !self.powersave && !self.balanced
    }

    fn can_balanced(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.powersave
            && !self.no_core_compaction
    }

    fn can_powersave(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.balanced
            && !self.no_core_compaction
    }

    fn proc(&mut self) -> Option<&mut Self> {
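        // Default to autopilot mode when no conflicting power-mode option is given.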
        if !self.autopilot {
            self.autopilot = self.can_autopilot();
        }

        if self.autopilot {
            if !self.can_autopilot() {
                info!("Autopilot mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopilot mode is enabled.");
        }

        if self.autopower {
            if !self.can_autopower() {
                info!("Autopower mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopower mode is enabled.");
        }

        if self.performance {
            if !self.can_performance() {
                info!("Performance mode cannot be used with conflicting options.");
                return None;
            }
            info!("Performance mode is enabled.");
            self.no_core_compaction = true;
        }

        if self.powersave {
            if !self.can_powersave() {
                info!("Powersave mode cannot be used with conflicting options.");
                return None;
            }
            info!("Powersave mode is enabled.");
            self.no_core_compaction = false;
        }

        if self.balanced {
            if !self.can_balanced() {
                info!("Balanced mode cannot be used with conflicting options.");
                return None;
            }
            info!("Balanced mode is enabled.");
            self.no_core_compaction = false;
        }

        if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
            self.no_use_em = true;
            info!("Energy model won't be used for CPU preference order.");
        }

        if let Some(pinned_slice) = self.pinned_slice_us {
            if pinned_slice < self.slice_min_us || pinned_slice > self.slice_max_us {
                info!(
                    "pinned-slice-us ({}) must be between slice-min-us ({}) and slice-max-us ({})",
                    pinned_slice, self.slice_min_us, self.slice_max_us
                );
                return None;
            }
            info!(
                "Pinned task slice mode is enabled ({} us). Pinned tasks will use per-CPU DSQs.",
                pinned_slice
            );
        }

        Some(self)
    }

    fn preempt_shift_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 10)
    }

    fn mig_delta_pct_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 100)
    }
}

unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}

impl introspec {
    fn new() -> Self {
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
    mseq_id: u64,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Number of possible CPU IDs ({}) exceeds the maximum ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        try_set_rlimit_infinity();

        // Open the BPF prog first for verification.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose > 0);
        init_libbpf_logging(Some(PrintLevel::Debug));

        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops, open_opts)?;

        // Enable futex tracing using ftrace if available. If ftrace is not
        // available, fall back to tracepoints, which are known to be slower.
        if !opts.no_futex_boost {
            if !Self::attach_futex_ftraces(&mut skel)? {
                info!("Failed to attach futex ftraces. Trying tracepoints instead.");
                if !Self::attach_futex_tracepoints(&mut skel)? {
                    info!("Failed to attach futex tracepoints.");
                }
            }
        }

        // Initialize the CPU topology with CLI arguments.
        let order = CpuOrder::new(opts.topology.as_ref()).unwrap();
        Self::init_cpus(&mut skel, &order);
        Self::init_cpdoms(&mut skel, &order);

        // Initialize the skel according to @opts.
        Self::init_globals(&mut skel, opts, &order);

        // Initialize the arena.
        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let task_size = std::mem::size_of::<types::task_ctx>();
        let arenalib = ArenaLib::init(skel.object_mut(), task_size, *NR_CPU_IDS)?;
        arenalib.setup()?;

        // Attach.
        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        // Build a ring buffer for instrumentation.
        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }

    fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
        let ftraces = vec![
            ("__futex_wait", &skel.progs.fexit___futex_wait),
            ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
            (
                "futex_wait_requeue_pi",
                &skel.progs.fexit_futex_wait_requeue_pi,
            ),
            ("futex_wake", &skel.progs.fexit_futex_wake),
            ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
            ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
            ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
        ];

        if !compat::tracer_available("function")? {
            info!("Ftrace is not enabled in the kernel.");
            return Ok(false);
        }

        compat::cond_kprobes_enable(ftraces)
    }

    fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
        let tracepoints = vec![
            ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
            ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
            (
                "syscalls:sys_exit_futex_wait",
                &skel.progs.rtp_sys_exit_futex_wait,
            ),
            (
                "syscalls:sys_exit_futex_waitv",
                &skel.progs.rtp_sys_exit_futex_waitv,
            ),
            (
                "syscalls:sys_exit_futex_wake",
                &skel.progs.rtp_sys_exit_futex_wake,
            ),
        ];

        compat::cond_tracepoints_enable(tracepoints)
    }

    fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        debug!("{:#?}", order);

        // Initialize CPU capacity and sibling information.
        {
            let rodata = skel.maps.rodata_data.as_mut().unwrap();
            for cpu in order.cpuids.iter() {
                rodata.cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
                rodata.cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
                rodata.cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
                rodata.cpu_sibling[cpu.cpu_adx] = cpu.cpu_sibling as u32;
            }
        }

        // Initialize the performance vs. CPU order table.
        let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
        if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
            panic!("Generated performance vs. CPU order states are too complex ({nr_pco_states}) to handle");
        }

        skel.maps.rodata_data.as_mut().unwrap().nr_pco_states = nr_pco_states;
        for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
            Self::init_pco_tuple(skel, i, &pco);
            info!("{:#}", pco);
        }

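        // Fill the remaining slots with the last tuple so that any state index
        // up to LAVD_PCO_STATE_MAX maps to a valid entry.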
        let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
        for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
            Self::init_pco_tuple(skel, i as usize, &last_pco);
        }
    }

    fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
        let cpus_perf = pco.cpus_perf.borrow();
        let cpus_ovflw = pco.cpus_ovflw.borrow();
        let pco_nr_primary = cpus_perf.len();
        let rodata = skel.maps.rodata_data.as_mut().unwrap();

        rodata.pco_bounds[i] = pco.perf_cap as u32;
        rodata.pco_nr_primary[i] = pco_nr_primary as u16;

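        // pco_table[i] lists the primary CPUs first, followed by the overflow
        // CPUs starting at index pco_nr_primary.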
        for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
            rodata.pco_table[i][j] = cpu_adx as u16;
        }

        for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
            let k = j + pco_nr_primary;
            rodata.pco_table[i][k] = cpu_adx as u16;
        }
    }

    fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        let bss_data = skel.maps.bss_data.as_mut().unwrap();

        // Initialize compute domain contexts.
        for (key, v) in order.cpdom_map.iter() {
            let cpdom = &mut bss_data.cpdom_ctxs[v.cpdom_id];
            cpdom.id = v.cpdom_id as u64;
            cpdom.alt_id = v.cpdom_alt_id.get() as u64;
            cpdom.numa_id = key.numa_adx as u8;
            cpdom.llc_id = key.llc_adx as u8;
            cpdom.is_big = key.is_big as u8;
            cpdom.is_valid = 1;
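
            // Set this CPU's bit in the domain's cpumask; each u64 word of
            // __cpumask covers 64 CPUs.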
            for cpu_id in v.cpu_ids.iter() {
                let i = cpu_id / 64;
                let j = cpu_id % 64;
                cpdom.__cpumask[i] |= 0x01 << j;
            }

            if v.neighbor_map.borrow().len() > LAVD_CPDOM_MAX_DIST as usize {
                panic!("The processor topology is too complex to handle in BPF.");
            }

            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                let nr_neighbors = neighbors.borrow().len() as u8;
                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                    panic!("The processor topology is too complex to handle in BPF.");
                }
                cpdom.nr_neighbors[k] = nr_neighbors;
                for n in neighbors.borrow().iter() {
                    cpdom.neighbor_bits[k] |= 0x1 << n;
                }
            }
        }
    }

    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder) {
        let bss_data = skel.maps.bss_data.as_mut().unwrap();
        bss_data.no_preemption = opts.no_preemption;
        bss_data.no_core_compaction = opts.no_core_compaction;
        bss_data.no_freq_scaling = opts.no_freq_scaling;
        bss_data.is_powersave_mode = opts.powersave;

        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.nr_llcs = order.nr_llcs as u64;
        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
        rodata.is_smt_active = order.smt_enabled;
        rodata.is_autopilot_on = opts.autopilot;
        rodata.verbose = opts.verbose;
        rodata.slice_max_ns = opts.slice_max_us * 1000;
        rodata.slice_min_ns = opts.slice_min_us * 1000;
        rodata.pinned_slice_ns = opts.pinned_slice_us.map(|v| v * 1000).unwrap_or(0);
        rodata.preempt_shift = opts.preempt_shift;
        rodata.mig_delta_pct = opts.mig_delta_pct;
        rodata.no_use_em = opts.no_use_em as u8;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.no_slice_boost = opts.no_slice_boost;
        rodata.per_cpu_dsq = opts.per_cpu_dsq;
        rodata.enable_cpu_bw = opts.enable_cpu_bw;

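        // Ask sched_ext to also enqueue exiting and migration-disabled tasks,
        // to enqueue a task when it is the last runnable one, and to keep the
        // kernel's built-in idle CPU tracking.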
        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
    }

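    // Return the next monotonically increasing sequence number for
    // introspection messages.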
    fn get_msg_seq_id() -> u64 {
        static MSEQ: AtomicU64 = AtomicU64::new(0);
        MSEQ.fetch_add(1, Ordering::Relaxed) + 1
    }

    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;
        let tc = mt.taskc;

        // Only LAVD_MSG_TASKC messages are printable, so ignore other kinds.
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

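        // comm and stat are fixed-size C-string buffers (assumed to be
        // NUL-terminated by the BPF side); convert them to &str via CStr.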
        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_waker_cm: *const c_char = (&tc.waker_comm as *const [c_char; 17]) as *const c_char;
        let c_waker_cm_str: &CStr = unsafe { CStr::from_ptr(c_waker_cm) };
        let waker_comm: &str = c_waker_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tc.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tc.cpu_id,
            prev_cpu_id: tc.prev_cpu_id,
            suggested_cpu_id: tc.suggested_cpu_id,
            waker_pid: tc.waker_pid,
            waker_comm: waker_comm.into(),
            slice: tc.slice,
            lat_cri: tc.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            rerunnable_interval: tx.rerunnable_interval,
            resched_interval: tc.resched_interval,
            run_freq: tc.run_freq,
            avg_runtime: tc.avg_runtime,
            wait_freq: tc.wait_freq,
            wake_freq: tc.wake_freq,
            perf_cri: tc.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util: tx.cpu_util,
            cpu_sutil: tx.cpu_sutil,
            nr_active: tx.nr_active,
            dsq_id: tx.dsq_id,
            dsq_consume_lat: tx.dsq_consume_lat,
            slice_used: tc.last_slice_used,
        }) {
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
        }
    }

    fn prep_introspec(&mut self) {
        let bss_data = self.skel.maps.bss_data.as_mut().unwrap();
        if !bss_data.is_monitored {
            bss_data.is_monitored = true;
        }
        bss_data.intrspc.cmd = self.intrspc.cmd;
        bss_data.intrspc.arg = self.intrspc.arg;
    }

    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = LAVD_CMD_NOP;
    }

    fn get_pc(x: u64, y: u64) -> f64 {
        100. * x as f64 / y as f64
    }

    fn get_power_mode(power_mode: i32) -> &'static str {
        match power_mode as u32 {
            LAVD_PM_PERFORMANCE => "performance",
            LAVD_PM_BALANCED => "balanced",
            LAVD_PM_POWERSAVE => "powersave",
            _ => "unknown",
        }
    }

    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let nr_preempt = st.nr_preempt;
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    nr_preempt,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }

    fn stop_monitoring(&mut self) {
        let bss_data = self.skel.maps.bss_data.as_mut().unwrap();
        if bss_data.is_monitored {
            bss_data.is_monitored = false;
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

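    // Switch the BPF-side power mode by invoking the set_power_profile BPF
    // program through test_run(), passing power_arg as its input context.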
    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
        let prog = &mut self.skel.progs.set_power_profile;
        let mut args = power_arg {
            power_mode: mode as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

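    // Poll the system power profile and propagate changes to BPF. Returns
    // (keep_polling, profile); keep_polling turns false once an unknown
    // profile is seen, which disables further autopower updates.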
    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
        let profile = fetch_power_profile(false);
        if profile == prev_profile {
            // If the profile is unchanged, skip updating the BPF side.
            return (true, profile);
        }

        let _ = match profile {
            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
            PowerProfile::Unknown => {
                // We don't know how to handle an unknown power profile,
                // so we just give up updating the profile from now on.
                return (false, profile);
            }
        };

        info!("Set the scheduler's power profile to {profile} mode.");
        (true, profile)
    }

    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {
                    self.stop_monitoring();
                }
                Err(e) => {
                    self.stop_monitoring();
                    Err(e)?
                }
            }
            self.cleanup_introspec();
        }
        self.rb_mgr.consume().unwrap();

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");

        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

fn init_log(opts: &Opts) {
    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )
    .unwrap();
}

fn main() -> Result<()> {
    let mut opts = Opts::parse();

    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    if opts.monitor.is_none() && opts.monitor_sched_samples.is_none() {
        opts.proc().unwrap();
        info!("{:#?}", opts);
    }

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

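    // With --monitor, run only the stats client and exit when it finishes.
    // With --stats, keep the client running and fall through to launch the
    // scheduler.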
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}