scx_lavd/main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Valve Corporation.
// Author: Changwoo Min <changwoo@igalia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod cpu_order;
use scx_utils::init_libbpf_logging;
mod stats;
use std::ffi::c_int;
use std::ffi::CStr;
use std::mem;
use std::mem::MaybeUninit;
use std::str;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use clap_num::number_range;
use cpu_order::CpuOrder;
use cpu_order::PerfCpuOrder;
use crossbeam::channel;
use crossbeam::channel::Receiver;
use crossbeam::channel::RecvTimeoutError;
use crossbeam::channel::Sender;
use crossbeam::channel::TrySendError;
use libbpf_rs::OpenObject;
use libbpf_rs::PrintLevel;
use libbpf_rs::ProgramInput;
use libc::c_char;
use log::debug;
use log::info;
use plain::Plain;
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::try_set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::EnergyModel;
use scx_utils::TopologyArgs;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::SchedSample;
use stats::SchedSamples;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;

const SCHEDULER_NAME: &str = "scx_lavd";

/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The Rust part is minimal: it processes command-line options and logs
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See main.bpf.c for a more detailed overview of the LAVD design.
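///
/// Illustrative invocations: `scx_lavd` with no options runs in autopilot
/// mode (the default), `scx_lavd --performance` runs with maximum
/// performance and core compaction disabled, and `scx_lavd --monitor 1.0`
/// prints statistics every second without launching the scheduler.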
#[derive(Debug, Parser)]
struct Opts {
    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced), CPU preference order, etc., based on system
    /// load. The options affecting the power mode and the use of core
    /// compaction (--autopower, --performance, --powersave, --balanced,
    /// --no-core-compaction) cannot be used with this option. When no option
    /// is specified, this is the default mode.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced) based on the system's active power profile.
    /// The scheduler's power mode decides the CPU preference order and the use
    /// of core compaction, so the options affecting these (--autopilot,
    /// --performance, --powersave, --balanced, --no-core-compaction) cannot
    /// be used with this option.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    /// Run the scheduler in performance mode to get maximum performance.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --balanced, --powersave, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run the scheduler in powersave mode to minimize power consumption.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --performance, --balanced, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run the scheduler in balanced mode, aiming for a sweet spot between
    /// power and performance. This option cannot be used with other
    /// conflicting options (--autopilot, --autopower, --performance,
    /// --powersave, --no-core-compaction) affecting the use of core
    /// compaction.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    /// Limit preemption to roughly the top P% of latency-critical tasks.
    /// When N is given as an argument, P is 0.5^N * 100. The default value
    /// is 6, which limits preemption to the top 1.56% of latency-critical
    /// tasks.
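    /// For example, passing 4 widens the limit to 0.5^4 * 100 = 6.25% of
    /// latency-critical tasks, while 10 narrows it to about 0.1%.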
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4"). The scheduler
    /// uses this CPU preference order only when core compaction is enabled
    /// (i.e., balanced or powersave mode is specified as an option or chosen
    /// in the autopilot or autopower mode). When "--cpu-pref-order" is given,
    /// it implies "--no-use-em".
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    /// Do not use the energy model in making CPU preference order decisions.
    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Disable preemption.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    /// Disable an optimization for synchronous wake-up.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Disable dynamic slice boost for long-running tasks.
    #[clap(long = "no-slice-boost", action = clap::ArgAction::SetTrue)]
    no_slice_boost: bool,

    /// Enable per-CPU DSQs, so tasks are queued to and dispatched from
    /// CPU-specific DSQs. This generally improves L1/L2 cache locality for
    /// tasks and lowers lock contention compared to shared DSQs, but at the
    /// cost of higher load-balancing complexity. This is a highly
    /// experimental feature.
    #[clap(long = "per-cpu-dsq", action = clap::ArgAction::SetTrue)]
    per_cpu_dsq: bool,

    /// Disable core compaction so the scheduler uses all the online CPUs.
    /// Core compaction attempts to minimize the number of actively used
    /// CPUs for unaffinitized tasks, respecting the CPU preference order.
    /// Normally, core compaction is enabled by the power mode (i.e.,
    /// balanced or powersave mode is specified as an option or chosen in
    /// the autopilot or autopower mode). This option cannot be used with the
    /// other options that control core compaction (--autopilot,
    /// --autopower, --performance, --balanced, --powersave).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Disable controlling the CPU frequency.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The
    /// scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    /// Enable verbose output, including libbpf details. Specify multiple
    /// times to increase verbosity.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,

    /// Topology configuration options.
    #[clap(flatten)]
    topology: Option<TopologyArgs>,
}

impl Opts {
    fn can_autopilot(&self) -> bool {
        !self.autopower
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_autopower(&self) -> bool {
        !self.autopilot
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_performance(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.powersave
            && !self.balanced
    }

    fn can_balanced(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.powersave
            && !self.no_core_compaction
    }

    fn can_powersave(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.balanced
            && !self.no_core_compaction
    }

    fn proc(&mut self) -> Option<&mut Self> {
        if !self.autopilot {
            self.autopilot = self.can_autopilot();
        }

        if self.autopilot {
            if !self.can_autopilot() {
                info!("Autopilot mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopilot mode is enabled.");
        }

        if self.autopower {
            if !self.can_autopower() {
                info!("Autopower mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopower mode is enabled.");
        }

        if self.performance {
            if !self.can_performance() {
                info!("Performance mode cannot be used with conflicting options.");
                return None;
            }
            info!("Performance mode is enabled.");
            self.no_core_compaction = true;
        }

        if self.powersave {
            if !self.can_powersave() {
                info!("Powersave mode cannot be used with conflicting options.");
                return None;
            }
            info!("Powersave mode is enabled.");
            self.no_core_compaction = false;
        }

        if self.balanced {
            if !self.can_balanced() {
                info!("Balanced mode cannot be used with conflicting options.");
                return None;
            }
            info!("Balanced mode is enabled.");
            self.no_core_compaction = false;
        }

        if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
            self.no_use_em = true;
            info!("Energy model won't be used for CPU preference order.");
        }

        Some(self)
    }

    fn preempt_shift_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 10)
    }
}
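
// A minimal, hypothetical test sketch (not part of the original tree)
// exercising the mode-conflict rules above. It assumes clap's
// `Parser::parse_from` and relies on `Opts::proc` returning None on
// conflicting flags, as implemented above.
#[cfg(test)]
mod opts_conflict_tests {
    use super::*;

    #[test]
    fn autopilot_is_the_default() {
        // With no mode flags given, proc() promotes the run to autopilot.
        let mut opts = Opts::parse_from(["scx_lavd"]);
        assert!(opts.proc().is_some());
        assert!(opts.autopilot);
    }

    #[test]
    fn performance_conflicts_with_powersave() {
        // Two explicit power modes conflict, so proc() rejects them.
        let mut opts = Opts::parse_from(["scx_lavd", "--performance", "--powersave"]);
        assert!(opts.proc().is_none());
    }
}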
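
// msg_task_ctx is a plain-old-data struct shared with the BPF side, so
// byte-level reinterpretation via `Plain` is sound.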
unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}

impl introspec {
    fn new() -> Self {
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
    mseq_id: u64,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Number of possible CPU IDs ({}) exceeds the maximum ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        try_set_rlimit_infinity();

        // Open the BPF prog first for verification.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose > 0);
        init_libbpf_logging(Some(PrintLevel::Debug));

        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops, open_opts)?;

        // Enable futex tracing using ftrace if available. If ftrace is not
        // available, fall back to tracepoints, which are known to be slower.
        if !opts.no_futex_boost {
            if !Self::attach_futex_ftraces(&mut skel)? {
                info!("Failed to attach futex ftraces. Trying tracepoints instead.");
                if !Self::attach_futex_tracepoints(&mut skel)? {
                    info!("Failed to attach futex tracepoints.");
                }
            }
        }

        // Initialize CPU topology with CLI arguments.
        let order = CpuOrder::new(opts.topology.as_ref()).unwrap();
        Self::init_cpus(&mut skel, &order);
        Self::init_cpdoms(&mut skel, &order);

        // Initialize skel according to @opts.
        Self::init_globals(&mut skel, &opts, &order);

        // Load and attach.
        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        // Build a ring buffer for instrumentation.
        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }

    fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
        let ftraces = vec![
            ("__futex_wait", &skel.progs.fexit___futex_wait),
            ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
            (
                "futex_wait_requeue_pi",
                &skel.progs.fexit_futex_wait_requeue_pi,
            ),
            ("futex_wake", &skel.progs.fexit_futex_wake),
            ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
            ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
            ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
        ];

        compat::cond_kprobes_enable(ftraces)
    }

    fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
        let tracepoints = vec![
            ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
            ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
            (
                "syscalls:sys_exit_futex_wait",
                &skel.progs.rtp_sys_exit_futex_wait,
            ),
            (
                "syscalls:sys_exit_futex_waitv",
                &skel.progs.rtp_sys_exit_futex_waitv,
            ),
            (
                "syscalls:sys_exit_futex_wake",
                &skel.progs.rtp_sys_exit_futex_wake,
            ),
        ];

        compat::cond_tracepoints_enable(tracepoints)
    }

    fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        debug!("{:#?}", order);

        // Initialize CPU capacity and sibling info.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        for cpu in order.cpuids.iter() {
            rodata.cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
            rodata.cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
            rodata.cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
            rodata.cpu_sibling[cpu.cpu_adx] = cpu.cpu_sibling as u32;
        }

        // Initialize the performance vs. CPU order table.
        let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
        if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
            panic!("Generated performance vs. CPU order states are too complex ({nr_pco_states}) to handle");
        }

        rodata.nr_pco_states = nr_pco_states;
        for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
            Self::init_pco_tuple(skel, i, pco);
            info!("{:#}", pco);
        }

        let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
        for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
            Self::init_pco_tuple(skel, i as usize, last_pco);
        }
    }

    fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
        let cpus_perf = pco.cpus_perf.borrow();
        let cpus_ovflw = pco.cpus_ovflw.borrow();
        let pco_nr_primary = cpus_perf.len();
        let rodata = skel.maps.rodata_data.as_mut().unwrap();

        rodata.pco_bounds[i] = pco.perf_cap as u32;
        rodata.pco_nr_primary[i] = pco_nr_primary as u16;

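        // Fill the per-state CPU order: the primary (preferred) CPUs come
        // first, followed by the overflow CPUs.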
        for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
            rodata.pco_table[i][j] = cpu_adx as u16;
        }

        for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
            let k = j + pco_nr_primary;
            rodata.pco_table[i][k] = cpu_adx as u16;
        }
    }

    fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        let bss_data = skel.maps.bss_data.as_mut().unwrap();

        // Initialize compute domain contexts.
        for (k, v) in order.cpdom_map.iter() {
            bss_data.cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
            bss_data.cpdom_ctxs[v.cpdom_id].alt_id = v.cpdom_alt_id.get() as u64;
            bss_data.cpdom_ctxs[v.cpdom_id].numa_id = k.numa_adx as u8;
            bss_data.cpdom_ctxs[v.cpdom_id].llc_id = k.llc_adx as u8;
            bss_data.cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
            bss_data.cpdom_ctxs[v.cpdom_id].is_valid = 1;
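            // Mark each member CPU in the domain's cpumask, stored as an
            // array of 64-bit words (word = cpu_id / 64, bit = cpu_id % 64).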
            for cpu_id in v.cpu_ids.iter() {
                let i = cpu_id / 64;
                let j = cpu_id % 64;
                bss_data.cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
            }

            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
                panic!("The processor topology is too complex to handle in BPF.");
            }

            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                let nr_neighbors = neighbors.borrow().len() as u8;
                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                    panic!("The processor topology is too complex to handle in BPF.");
                }
                bss_data.cpdom_ctxs[v.cpdom_id].nr_neighbors[k] = nr_neighbors;
                for n in neighbors.borrow().iter() {
                    bss_data.cpdom_ctxs[v.cpdom_id].neighbor_bits[k] |= 0x1 << n;
                }
            }
        }
    }

    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder) {
        let bss_data = skel.maps.bss_data.as_mut().unwrap();
        bss_data.no_preemption = opts.no_preemption;
        bss_data.no_core_compaction = opts.no_core_compaction;
        bss_data.no_freq_scaling = opts.no_freq_scaling;
        bss_data.is_powersave_mode = opts.powersave;

        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.nr_llcs = order.nr_llcs as u64;
        rodata.__nr_cpu_ids = *NR_CPU_IDS as u64;
        rodata.is_smt_active = order.smt_enabled;
        rodata.is_autopilot_on = opts.autopilot;
        rodata.verbose = opts.verbose;
        rodata.slice_max_ns = opts.slice_max_us * 1000;
        rodata.slice_min_ns = opts.slice_min_us * 1000;
        rodata.preempt_shift = opts.preempt_shift;
        rodata.no_use_em = opts.no_use_em as u8;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.no_slice_boost = opts.no_slice_boost;
        rodata.per_cpu_dsq = opts.per_cpu_dsq;

        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
    }

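    // Generate a monotonically increasing message sequence ID. This assumes
    // the ring-buffer callback runs on a single thread; the unsynchronized
    // `static mut` would be racy otherwise.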
    fn get_msg_seq_id() -> u64 {
        static mut MSEQ: u64 = 0;
        unsafe {
            MSEQ += 1;
            MSEQ
        }
    }

    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;
        let tc = mt.taskc;

        // Only LAVD_MSG_TASKC messages are printable; ignore everything else.
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

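        // The comm and stat fields are fixed-size, NUL-terminated C strings;
        // convert them to &str before building the sample.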
        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_waker_cm: *const c_char = (&tc.waker_comm as *const [c_char; 17]) as *const c_char;
        let c_waker_cm_str: &CStr = unsafe { CStr::from_ptr(c_waker_cm) };
        let waker_comm: &str = c_waker_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tx.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tc.cpu_id,
            prev_cpu_id: tc.prev_cpu_id,
            suggested_cpu_id: tc.suggested_cpu_id,
            waker_pid: tc.waker_pid,
            waker_comm: waker_comm.into(),
            slice: tc.slice,
            lat_cri: tc.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            rerunnable_interval: tx.rerunnable_interval,
            resched_interval: tc.resched_interval,
            run_freq: tc.run_freq,
            avg_runtime: tc.avg_runtime,
            wait_freq: tc.wait_freq,
            wake_freq: tc.wake_freq,
            perf_cri: tc.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util: tx.cpu_util,
            cpu_sutil: tx.cpu_sutil,
            nr_active: tx.nr_active,
            dsq_id: tx.dsq_id,
            dsq_consume_lat: tx.dsq_consume_lat,
            slice_used: tc.last_slice_used,
        }) {
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
        }
    }

    fn prep_introspec(&mut self) {
        if !self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
            self.skel.maps.bss_data.as_mut().unwrap().is_monitored = true;
        }
        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = self.intrspc.cmd;
        self.skel.maps.bss_data.as_mut().unwrap().intrspc.arg = self.intrspc.arg;
    }

    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = LAVD_CMD_NOP;
    }

    fn get_pc(x: u64, y: u64) -> f64 {
        100. * x as f64 / y as f64
    }

    fn get_power_mode(power_mode: i32) -> &'static str {
        match power_mode as u32 {
            LAVD_PM_PERFORMANCE => "performance",
            LAVD_PM_BALANCED => "balanced",
            LAVD_PM_POWERSAVE => "powersave",
            _ => "unknown",
        }
    }

    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let nr_preempt = st.nr_preempt;
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    nr_preempt,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }

    fn stop_monitoring(&mut self) {
        if self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
            self.skel.maps.bss_data.as_mut().unwrap().is_monitored = false;
        }
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
        let prog = &mut self.skel.progs.set_power_profile;
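        // Invoke the BPF-side set_power_profile() program via test_run,
        // passing the mode as the raw bytes of `args` in the context buffer.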
        let mut args = power_arg {
            power_mode: mode as c_int,
        };
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
        let profile = fetch_power_profile(false);
        if profile == prev_profile {
            // If the profile is unchanged, skip updating the BPF side.
            return (true, profile);
        }

        let _ = match profile {
            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
            PowerProfile::Unknown => {
                // We don't know how to handle an unknown power profile,
                // so give up updating the profile from now on.
                return (false, profile);
            }
        };

        info!("Set the scheduler's power profile to {profile} mode.");
        (true, profile)
    }

    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {
                    self.stop_monitoring();
                }
                Err(e) => {
                    self.stop_monitoring();
                    Err(e)?
                }
            }
            self.cleanup_introspec();
        }
        self.rb_mgr.consume().unwrap();

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        info!("Unregister {SCHEDULER_NAME} scheduler");

        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

fn init_log(opts: &Opts) {
    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )
    .unwrap();
}

fn main() -> Result<()> {
    let mut opts = Opts::parse();

    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    if opts.monitor.is_none() && opts.monitor_sched_samples.is_none() {
        opts.proc().unwrap();
        info!("{:#?}", opts);
    }

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}