scx_lavd/main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Valve Corporation.
// Author: Changwoo Min <changwoo@igalia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod cpu_order;
mod stats;
use std::ffi::c_int;
use std::ffi::CStr;
use std::mem;
use std::mem::MaybeUninit;
use std::str;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use clap_num::number_range;
use cpu_order::CpuOrder;
use cpu_order::PerfCpuOrder;
use crossbeam::channel;
use crossbeam::channel::Receiver;
use crossbeam::channel::RecvTimeoutError;
use crossbeam::channel::Sender;
use crossbeam::channel::TrySendError;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use libc::c_char;
use log::debug;
use log::info;
use plain::Plain;
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::EnergyModel;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::SchedSample;
use stats::SchedSamples;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;

/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The Rust part is minimal. It processes command line options and logs
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See main.bpf.c for a more detailed overview of the LAVD design.
#[derive(Debug, Parser)]
struct Opts {
    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced), CPU preference order, etc., based on system
    /// load. The options affecting the power mode and the use of core
    /// compaction (--autopower, --performance, --powersave, --balanced,
    /// --no-core-compaction) cannot be used with this option. When no option
    /// is specified, this is the default mode.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced) based on the system's active power profile.
    /// The scheduler's power mode decides the CPU preference order and the use
    /// of core compaction, so the options affecting these (--autopilot,
    /// --performance, --powersave, --balanced, --no-core-compaction) cannot
    /// be used with this option.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    /// Run the scheduler in performance mode to get maximum performance.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --balanced, --powersave, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run the scheduler in powersave mode to minimize power consumption.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --performance, --balanced, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run the scheduler in balanced mode, aiming for a sweet spot between
    /// power and performance. This option cannot be used with other
    /// conflicting options (--autopilot, --autopower, --performance,
    /// --powersave, --no-core-compaction) affecting the use of core compaction.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    /// Limit preemption to roughly the top P% of latency-critical tasks.
    /// When N is given as an argument, P is 0.5^N * 100. The default value
    /// is 6, which limits preemption to the top 1.56% of latency-critical
    /// tasks.
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4"). The scheduler
    /// uses the CPU preference order only when core compaction is enabled
    /// (i.e., balanced or powersave mode is specified as an option or chosen
    /// in the autopilot or autopower mode). When "--cpu-pref-order" is given,
    /// it implies "--no-use-em".
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    /// Do not use the energy model in making CPU preference order decisions.
    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Disable preemption.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    /// Disable an optimization for synchronous wake-up.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Disable core compaction so the scheduler uses all the online CPUs.
    /// The core compaction attempts to minimize the number of actively used
    /// CPUs for unaffinitized tasks, respecting the CPU preference order.
    /// Normally, the core compaction is enabled by the power mode (i.e.,
    /// balanced or powersave mode is specified as an option or chosen in
    /// the autopilot or autopower mode). This option cannot be used with the
    /// other options that control the core compaction (--autopilot,
    /// --autopower, --performance, --balanced, --powersave).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Disable controlling the CPU frequency.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The
    /// scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    /// Enable verbose output, including libbpf details. Specify multiple
    /// times to increase verbosity.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}

impl Opts {
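    // Each can_*() helper below encodes which other command-line options
    // conflict with the corresponding power mode.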
    fn can_autopilot(&self) -> bool {
        !self.autopower
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_autopower(&self) -> bool {
        !self.autopilot
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_performance(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.powersave
            && !self.balanced
    }

    fn can_balanced(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.powersave
            && !self.no_core_compaction
    }

    fn can_powersave(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.balanced
            && !self.no_core_compaction
    }

    fn proc(&mut self) -> Option<&mut Self> {
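        // When no power-mode option is explicitly given, fall back to
        // autopilot, which is the default mode.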
        if !self.autopilot {
            self.autopilot = self.can_autopilot();
        }

        if self.autopilot {
            if !self.can_autopilot() {
                info!("Autopilot mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopilot mode is enabled.");
        }

        if self.autopower {
            if !self.can_autopower() {
                info!("Autopower mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopower mode is enabled.");
        }

        if self.performance {
            if !self.can_performance() {
                info!("Performance mode cannot be used with conflicting options.");
                return None;
            }
            info!("Performance mode is enabled.");
            self.no_core_compaction = true;
        }

        if self.powersave {
            if !self.can_powersave() {
                info!("Powersave mode cannot be used with conflicting options.");
                return None;
            }
            info!("Powersave mode is enabled.");
            self.no_core_compaction = false;
        }

        if self.balanced {
            if !self.can_balanced() {
                info!("Balanced mode cannot be used with conflicting options.");
                return None;
            }
            info!("Balanced mode is enabled.");
            self.no_core_compaction = false;
        }

        if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
            self.no_use_em = true;
            info!("Energy model won't be used for CPU preference order.");
        }

        Some(self)
    }

    fn preempt_shift_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 10)
    }
}
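
// A minimal sanity check (sketch) of the option-conflict rules above; the
// expected outcomes follow from the can_*() helpers, and Opts::parse_from()
// is clap's standard way of parsing an explicit argv.
#[cfg(test)]
mod opts_tests {
    use super::*;

    #[test]
    fn conflicting_modes_are_rejected() {
        // --performance and --powersave conflict, so proc() must bail out.
        let mut opts = Opts::parse_from(["scx_lavd", "--performance", "--powersave"]);
        assert!(opts.proc().is_none());
    }

    #[test]
    fn autopilot_is_the_default() {
        // With no power-mode option at all, proc() enables autopilot.
        let mut opts = Opts::parse_from(["scx_lavd"]);
        assert!(opts.proc().is_some());
        assert!(opts.autopilot);
    }
}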

unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}

impl introspec {
    fn new() -> Self {
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
    mseq_id: u64,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Num possible CPU IDs ({}) exceeds maximum of ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        // Increase the MEMLOCK limit since the BPF scheduler might use
        // more than the current limit.
        set_rlimit_infinity();

        // Open the BPF prog first for verification.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose > 0);
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops)?;

        // Enable futex tracing using ftrace if available. If ftrace is not
        // available, fall back to tracepoints, which are known to be slower.
        if !opts.no_futex_boost {
            if !Self::attach_futex_ftraces(&mut skel)? {
                info!("Failed to attach futex ftraces. Trying tracepoints.");
                if !Self::attach_futex_tracepoints(&mut skel)? {
                    info!("Failed to attach futex tracepoints.");
                }
            }
        }

        // Initialize CPU topology.
        let order = CpuOrder::new().unwrap();
        Self::init_cpus(&mut skel, &order);
        Self::init_cpdoms(&mut skel, &order);

        // Initialize skel according to @opts.
        Self::init_globals(&mut skel, &opts, &order);

        // Load and attach.
        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        // Build a ring buffer for instrumentation.
        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }

    fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
        let ftraces = vec![
            ("__futex_wait", &skel.progs.fexit___futex_wait),
            ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
            (
                "futex_wait_requeue_pi",
                &skel.progs.fexit_futex_wait_requeue_pi,
            ),
            ("futex_wake", &skel.progs.fexit_futex_wake),
            ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
            ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
            ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
        ];

        compat::cond_kprobes_enable(ftraces)
    }

    fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
        let tracepoints = vec![
            ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
            ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
            (
                "syscalls:sys_exit_futex_wait",
                &skel.progs.rtp_sys_exit_futex_wait,
            ),
            (
                "syscalls:sys_exit_futex_waitv",
                &skel.progs.rtp_sys_exit_futex_waitv,
            ),
            (
                "syscalls:sys_exit_futex_wake",
                &skel.progs.rtp_sys_exit_futex_wake,
            ),
        ];

        compat::cond_tracepoints_enable(tracepoints)
    }

    fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        debug!("{:#?}", order);

        // Initialize CPU capacity.
        for cpu in order.cpuids.iter() {
            skel.maps.rodata_data.cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
            skel.maps.rodata_data.cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
            skel.maps.rodata_data.cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
        }

        // Initialize the performance vs. CPU order table.
        let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
        if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
            panic!("Generated performance vs. CPU order states are too complex ({nr_pco_states}) to handle");
        }

        skel.maps.rodata_data.nr_pco_states = nr_pco_states;
        for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
            Self::init_pco_tuple(skel, i, &pco);
            info!("{:#}", pco);
        }

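        // Pad the remaining PCO slots with the last state so that lookups
        // beyond nr_pco_states still see a valid CPU order.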
        let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
        for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
            Self::init_pco_tuple(skel, i as usize, &last_pco);
        }
    }

    fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
        let cpus_perf = pco.cpus_perf.borrow();
        let cpus_ovflw = pco.cpus_ovflw.borrow();
        let pco_nr_primary = cpus_perf.len();

        skel.maps.rodata_data.pco_bounds[i] = pco.perf_cap as u32;
        skel.maps.rodata_data.pco_nr_primary[i] = pco_nr_primary as u16;

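        // The per-state CPU table lists the primary CPUs first, followed by
        // the overflow CPUs.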
        for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
            skel.maps.rodata_data.pco_table[i][j] = cpu_adx as u16;
        }

        for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
            let k = j + pco_nr_primary;
            skel.maps.rodata_data.pco_table[i][k] = cpu_adx as u16;
        }
    }

    fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
        // Initialize compute domain contexts.
        for (k, v) in order.cpdom_map.iter() {
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].alt_id = v.cpdom_alt_id.get() as u64;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].node_id = k.node_adx as u8;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].is_valid = 1;
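            // Set this domain's bit for each member CPU; __cpumask is an
            // array of u64 words holding 64 CPUs each.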
            for cpu_id in v.cpu_ids.iter() {
                let i = cpu_id / 64;
                let j = cpu_id % 64;
                skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
            }

            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
                panic!("The processor topology is too complex to handle in BPF.");
            }

            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                let nr_neighbors = neighbors.borrow().len() as u8;
                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                    panic!("The processor topology is too complex to handle in BPF.");
                }
                skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].nr_neighbors[k] = nr_neighbors;
                for n in neighbors.borrow().iter() {
                    skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].neighbor_bits[k] |= 0x1 << n;
                }
            }
        }
    }

    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder) {
        skel.maps.bss_data.no_preemption = opts.no_preemption;
        skel.maps.bss_data.no_wake_sync = opts.no_wake_sync;
        skel.maps.bss_data.no_core_compaction = opts.no_core_compaction;
        skel.maps.bss_data.no_freq_scaling = opts.no_freq_scaling;
        skel.maps.bss_data.is_powersave_mode = opts.powersave;
        skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u64;
        skel.maps.rodata_data.is_smt_active = order.smt_enabled;
        skel.maps.rodata_data.is_autopilot_on = opts.autopilot;
        skel.maps.rodata_data.verbose = opts.verbose;
        skel.maps.rodata_data.slice_max_ns = opts.slice_max_us * 1000;
        skel.maps.rodata_data.slice_min_ns = opts.slice_min_us * 1000;
        skel.maps.rodata_data.preempt_shift = opts.preempt_shift;
        skel.maps.rodata_data.no_use_em = opts.no_use_em as u8;

        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
    }

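    // Monotonic sequence number for introspection messages. The `static mut`
    // is safe here only because this is assumed to be called from a single
    // thread (the ring-buffer callback).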
    fn get_msg_seq_id() -> u64 {
        static mut MSEQ: u64 = 0;
        unsafe {
            MSEQ += 1;
            MSEQ
        }
    }

    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;
        let tc = mt.taskc;

        // Only LAVD_MSG_TASKC messages can be printed; ignore everything else.
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

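        // comm and stat arrive as fixed-size, NUL-terminated C strings, so
        // convert them through CStr.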
        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tx.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tx.cpu_id,
            slice_ns: tc.slice_ns,
            lat_cri: tc.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            run_freq: tc.run_freq,
            avg_runtime: tc.avg_runtime,
            wait_freq: tc.wait_freq,
            wake_freq: tc.wake_freq,
            perf_cri: tc.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util: tx.cpu_util,
            cpu_sutil: tx.cpu_sutil,
            nr_active: tx.nr_active,
        }) {
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
        }
    }

    fn prep_introspec(&mut self) {
        self.skel.maps.bss_data.intrspc.cmd = self.intrspc.cmd;
        self.skel.maps.bss_data.intrspc.arg = self.intrspc.arg;
    }

    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.intrspc.cmd = LAVD_CMD_NOP;
    }

    fn get_pc(x: u64, y: u64) -> f64 {
        100. * x as f64 / y as f64
    }

    fn get_power_mode(power_mode: i32) -> &'static str {
        match power_mode as u32 {
            LAVD_PM_PERFORMANCE => "performance",
            LAVD_PM_BALANCED => "balanced",
            LAVD_PM_POWERSAVE => "powersave",
            _ => "unknown",
        }
    }

    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = &self.skel.maps.bss_data;
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let nr_preempt = st.nr_preempt;
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    nr_preempt,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
        let prog = &mut self.skel.progs.set_power_profile;
        let mut args = power_arg {
            power_mode: mode as c_int,
        };
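        // Pass the argument struct to the BPF program as its test-run
        // context, reinterpreted as a raw byte slice.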
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
        let profile = fetch_power_profile(false);
        if profile == prev_profile {
            // If the profile is the same, skip updating the profile for BPF.
            return (true, profile);
        }

        let _ = match profile {
            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
            PowerProfile::Unknown => {
                // We don't know how to handle an unknown energy profile,
                // so we just give up updating the profile from now on.
                return (false, profile);
            }
        };

        info!("Set the scheduler's power profile to {profile} mode.");
        (true, profile)
    }

    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
            self.cleanup_introspec();
        }
        self.rb_mgr.consume().unwrap();

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

fn init_log(opts: &Opts) {
    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )
    .unwrap();
}

fn main() -> Result<()> {
    let mut opts = Opts::parse();

    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    opts.proc().unwrap();
    info!("{:#?}", opts);

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
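    // Run the scheduler until shutdown, re-initializing and restarting it
    // whenever the user exit info indicates a restart (should_restart()).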
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}