scx_lavd/main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Valve Corporation.
// Author: Changwoo Min <changwoo@igalia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::cell::Cell;
use std::cell::RefCell;
use std::collections::BTreeMap;
use std::ffi::CStr;
use std::ffi::c_int;
use std::fmt;
use std::mem;
use std::mem::MaybeUninit;
use std::str;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::thread::ThreadId;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use crossbeam::channel;
use crossbeam::channel::Receiver;
use crossbeam::channel::RecvTimeoutError;
use crossbeam::channel::Sender;
use crossbeam::channel::TrySendError;
use itertools::iproduct;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use libc::c_char;
use log::debug;
use log::info;
use plain::Plain;
use scx_stats::prelude::*;
use scx_utils::Cpumask;
use scx_utils::EnergyModel;
use scx_utils::NR_CPU_IDS;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::autopower::{PowerProfile, fetch_power_profile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::read_cpulist;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use stats::SchedSample;
use stats::SchedSamples;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;

/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The Rust part is minimal. It processes command line options and logs out
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See the more detailed overview of the LAVD design at main.bpf.c.
#[derive(Debug, Parser)]
struct Opts {
    /// Automatically decide the scheduler's power mode based on system load.
    /// This is the default mode when none of the following options are specified.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode based on the system's active power profile.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    /// Run in performance mode to get maximum performance.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run in powersave mode to minimize power consumption.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run in balanced mode, aiming for a sweet spot between power and performance (default).
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "300")]
    slice_min_us: u64,

    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4").
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Disable core compaction and schedule tasks across all online CPUs. Core compaction attempts
    /// to keep idle CPUs idle in favor of scheduling tasks on CPUs that are already
    /// awake. See main.bpf.c for more info. Normally set by the power mode, but can be set independently if
    /// desired.
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Schedule tasks on SMT siblings before using other physical cores when core compaction is
    /// enabled. Normally set by the power mode, but can be set independently if desired.
    #[clap(long = "prefer-smt-core", action = clap::ArgAction::SetTrue)]
    prefer_smt_core: bool,

    /// Schedule tasks on little (efficiency) cores before big (performance) cores when core compaction is
    /// enabled. Normally set by the power mode, but can be set independently if desired.
    #[clap(long = "prefer-little-core", action = clap::ArgAction::SetTrue)]
    prefer_little_core: bool,

    /// Do not specifically prefer to schedule on turbo cores. Normally set by the power mode, but
    /// can be set independently if desired.
    #[clap(long = "no-prefer-turbo-core", action = clap::ArgAction::SetTrue)]
    no_prefer_turbo_core: bool,

    /// Disable controlling the CPU frequency. In order to improve latency and responsiveness of
    /// performance-critical tasks, scx_lavd increases the CPU frequency even if CPU usage is low.
    /// See main.bpf.c for more info. Normally set by the power mode, but can be set independently
    /// if desired.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    /// Enable verbose output, including libbpf details. Specify multiple
    /// times to increase verbosity.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}

impl Opts {
    fn autopilot_allowed(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.powersave
            && !self.balanced
            && self.cpu_pref_order.is_empty()
            && !self.no_core_compaction
            && !self.prefer_smt_core
            && !self.prefer_little_core
            && !self.no_prefer_turbo_core
            && !self.no_freq_scaling
            && self.monitor.is_none()
            && self.monitor_sched_samples.is_none()
    }

    fn proc(&mut self) -> Option<&mut Self> {
        if self.autopilot_allowed() {
            self.autopilot = true;
            info!("Autopilot mode is enabled by default.");
            return Some(self);
        }

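        // An explicit power mode overrides the individual tuning knobs
        // below. If more than one mode flag is given, performance takes
        // precedence, then powersave, then balanced.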
        if self.performance {
            self.no_core_compaction = true;
            self.prefer_smt_core = false;
            self.prefer_little_core = false;
            self.no_prefer_turbo_core = false;
            self.no_freq_scaling = true;
        } else if self.powersave {
            self.no_core_compaction = false;
            self.prefer_smt_core = true;
            self.prefer_little_core = true;
            self.no_prefer_turbo_core = true;
            self.no_freq_scaling = false;
        } else if self.balanced {
            self.no_core_compaction = false;
            self.prefer_smt_core = false;
            self.prefer_little_core = false;
            self.no_prefer_turbo_core = false;
            self.no_freq_scaling = false;
        }

        Some(self)
    }
}

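// SAFETY: msg_task_ctx is a plain-old-data struct generated from the BPF
// skeleton, so it is assumed sound to reinterpret suitably sized and aligned
// ring buffer bytes as this type via the Plain trait.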
unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}

impl introspec {
    fn new() -> Self {
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }
}

#[derive(Debug, Clone)]
struct CpuFlatId {
    node_id: usize,
    pd_id: usize,
    llc_pos: usize,
    core_pos: usize,
    cpu_pos: usize,
    cpu_id: usize,
    smt_level: usize,
    cache_size: usize,
    cpu_cap: usize,
}

#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone)]
struct ComputeDomainKey {
    node_id: usize,
    llc_pos: usize,
    is_big: bool,
}

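/// Per-compute-domain bookkeeping: cpdom_alt_id names the domain sharing the
/// same node and LLC but with the opposite core type (see build_cpdom), and
/// neighbor_map groups the other domains by topological distance.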
#[derive(Debug, Clone)]
struct ComputeDomainValue {
    cpdom_id: usize,
    cpdom_alt_id: Cell<usize>,
    cpu_ids: Vec<usize>,
    neighbor_map: RefCell<BTreeMap<usize, RefCell<Vec<usize>>>>,
}

#[derive(Debug)]
struct FlatTopology {
    all_cpus_mask: Cpumask,
    cpu_fids_performance: Vec<CpuFlatId>,
    cpu_fids_powersave: Vec<CpuFlatId>,
    cpdom_map: BTreeMap<ComputeDomainKey, ComputeDomainValue>,
    smt_enabled: bool,
}

impl fmt::Display for FlatTopology {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for cpu_fid in self.cpu_fids_performance.iter() {
            write!(f, "\nCPU in performance: {:?}", cpu_fid).ok();
        }
        for cpu_fid in self.cpu_fids_powersave.iter() {
            write!(f, "\nCPU in powersave: {:?}", cpu_fid).ok();
        }
        for (k, v) in self.cpdom_map.iter() {
            write!(f, "\nCPDOM: {:?} {:?}", k, v).ok();
        }
        write!(f, "\nSMT: {}", self.smt_enabled).ok();
        Ok(())
    }
}

impl FlatTopology {
    /// Build a flat-structured topology
    pub fn new() -> Result<FlatTopology> {
        let sys_topo = Topology::new().expect("Failed to build host topology");
        let sys_em = EnergyModel::new();
        debug!("{:#?}", sys_topo);
        debug!("{:#?}", sys_em);

        let (cpu_fids_performance, avg_cap) =
            Self::build_cpu_fids(&sys_topo, &sys_em, false).unwrap();
        let (cpu_fids_powersave, _) = Self::build_cpu_fids(&sys_topo, &sys_em, true).unwrap();

        // Note that building the compute domain map does not depend on the
        // CPU order, so it is okay to use either of the cpu_fids_* vectors.
        let cpdom_map = Self::build_cpdom(&cpu_fids_performance, avg_cap).unwrap();

        Ok(FlatTopology {
            all_cpus_mask: sys_topo.span,
            cpu_fids_performance,
            cpu_fids_powersave,
            cpdom_map,
            smt_enabled: sys_topo.smt_enabled,
        })
    }

    /// Build a flat-structured list of CPUs in a preference order
    fn build_cpu_fids(
        topo: &Topology,
        em: &Result<EnergyModel>,
        prefer_powersave: bool,
    ) -> Option<(Vec<CpuFlatId>, usize)> {
        let mut cpu_fids = Vec::new();

        // Build a vector of cpu flat ids.
        let mut avg_cap = 0;
        for (&node_id, node) in topo.nodes.iter() {
            for (llc_pos, (_llc_id, llc)) in node.llcs.iter().enumerate() {
                for (core_pos, (_core_id, core)) in llc.cores.iter().enumerate() {
                    for (cpu_pos, (cpu_id, cpu)) in core.cpus.iter().enumerate() {
                        let cpu_id = *cpu_id;
                        let pd_id = Self::get_pd_id(em, cpu_id, node_id);
                        cpu_fids.push(CpuFlatId {
                            node_id,
                            pd_id,
                            llc_pos,
                            core_pos,
                            cpu_pos,
                            cpu_id,
                            smt_level: cpu.smt_level,
                            cache_size: cpu.cache_size,
                            cpu_cap: cpu.cpu_capacity,
                        });
                        avg_cap += cpu.cpu_capacity;
                    }
                }
            }
        }
        avg_cap /= cpu_fids.len();

        // Sort the cpu_fids. A leading ^ in the comments below denotes a
        // descending sort key.
        match prefer_powersave {
            true => {
                // Sort the cpu_fids by node, llc, cpu_cap, ^smt_level,
                // ^cache_size, perf_dom, core, and cpu order.
                cpu_fids.sort_by(|a, b| {
                    a.node_id
                        .cmp(&b.node_id)
                        .then_with(|| a.llc_pos.cmp(&b.llc_pos))
                        .then_with(|| a.cpu_cap.cmp(&b.cpu_cap))
                        .then_with(|| b.smt_level.cmp(&a.smt_level))
                        .then_with(|| b.cache_size.cmp(&a.cache_size))
                        .then_with(|| a.pd_id.cmp(&b.pd_id))
                        .then_with(|| a.core_pos.cmp(&b.core_pos))
                        .then_with(|| a.cpu_pos.cmp(&b.cpu_pos))
                });
            }
            false => {
                // Sort the cpu_fids by cpu, node, llc, ^cpu_cap, smt_level,
                // ^cache_size, perf_dom, and core order.
                cpu_fids.sort_by(|a, b| {
                    a.cpu_pos
                        .cmp(&b.cpu_pos)
                        .then_with(|| a.node_id.cmp(&b.node_id))
                        .then_with(|| a.llc_pos.cmp(&b.llc_pos))
                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
                        .then_with(|| a.smt_level.cmp(&b.smt_level))
                        .then_with(|| b.cache_size.cmp(&a.cache_size))
                        .then_with(|| a.pd_id.cmp(&b.pd_id))
                        .then_with(|| a.core_pos.cmp(&b.core_pos))
                });
            }
        }

        Some((cpu_fids, avg_cap))
    }

    /// Get the performance domain (i.e., CPU frequency domain) ID for a CPU.
    /// If the energy model is not available, use NUMA node ID instead.
    fn get_pd_id(em: &Result<EnergyModel>, cpu_id: usize, node_id: usize) -> usize {
        match em {
            Ok(em) => em.get_pd(cpu_id).unwrap().id,
            Err(_) => node_id,
        }
    }

    /// Build a list of compute domains
    fn build_cpdom(
        cpu_fids: &Vec<CpuFlatId>,
        avg_cap: usize,
    ) -> Option<BTreeMap<ComputeDomainKey, ComputeDomainValue>> {
        // Create a compute domain map, where a compute domain is the set of
        // CPUs that are under the same node and LLC and have the same core
        // type.
        let mut cpdom_id = 0;
        let mut cpdom_map: BTreeMap<ComputeDomainKey, ComputeDomainValue> = BTreeMap::new();
        for cpu_fid in cpu_fids.iter() {
            let key = ComputeDomainKey {
                node_id: cpu_fid.node_id,
                llc_pos: cpu_fid.llc_pos,
                is_big: cpu_fid.cpu_cap >= avg_cap,
            };
            let mut value;
            match cpdom_map.get(&key) {
                Some(v) => {
                    value = v.clone();
                }
                None => {
                    value = ComputeDomainValue {
                        cpdom_id,
                        cpdom_alt_id: Cell::new(cpdom_id),
                        cpu_ids: Vec::new(),
                        neighbor_map: RefCell::new(BTreeMap::new()),
                    };
                    cpdom_id += 1;
                }
            }
            value.cpu_ids.push(cpu_fid.cpu_id);
            cpdom_map.insert(key, value);
        }

        // Fill in cpdom_alt_id for each compute domain, where the
        // alternative compute domain is the one under the same node and
        // LLC but with the opposite core type.
        for (k, v) in cpdom_map.iter() {
            let mut key = k.clone();
            key.is_big = !k.is_big;

            if let Some(alt_v) = cpdom_map.get(&key) {
                v.cpdom_alt_id.set(alt_v.cpdom_id);
            }
        }

        // Build a neighbor map for each compute domain, where neighbors
        // are keyed by their distance (see dist() below).
        for ((from_k, from_v), (to_k, to_v)) in iproduct!(cpdom_map.iter(), cpdom_map.iter()) {
            if from_k == to_k {
                continue;
            }

            let d = Self::dist(from_k, to_k);
            let mut map = from_v.neighbor_map.borrow_mut();
            match map.get(&d) {
                Some(v) => {
                    v.borrow_mut().push(to_v.cpdom_id);
                }
                None => {
                    map.insert(d, RefCell::new(vec![to_v.cpdom_id]));
                }
            }
        }

        Some(cpdom_map)
    }

    /// Calculate the distance between two compute domains
    fn dist(from: &ComputeDomainKey, to: &ComputeDomainKey) -> usize {
        let mut d = 0;
        // core type > numa node > llc
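        // For example, two same-type domains under different LLCs on one
        // node are at distance 1, same-type domains on different nodes at
        // distance 2, and a pair with different core types adds 3 (3, 4,
        // or 5 in total).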
        if from.is_big != to.is_big {
            d += 3;
        }
        if from.node_id != to.node_id {
            d += 2;
        } else if from.llc_pos != to.llc_pos {
            d += 1;
        }
        d
    }
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
    mseq_id: u64,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Num possible CPU IDs ({}) exceeds maximum of ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        // Increase MEMLOCK size since the BPF scheduler might use
        // more than the current limit.
        set_rlimit_infinity();

        // Open the BPF prog first for verification.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose > 0);
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops)?;

        // Enable autoloads for conditionally loaded things immediately
        // after creating the skeleton (because this is always before loading).
        if !opts.no_futex_boost {
            compat::cond_tracepoint_enable(
                "syscalls:sys_enter_futex",
                &skel.progs.rtp_sys_enter_futex,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex",
                &skel.progs.rtp_sys_exit_futex,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex_wait",
                &skel.progs.rtp_sys_exit_futex_wait,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex_waitv",
                &skel.progs.rtp_sys_exit_futex_waitv,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex_wake",
                &skel.progs.rtp_sys_exit_futex_wake,
            )?;
        }

        // Initialize CPU topology.
        let topo = FlatTopology::new().unwrap();
        Self::init_cpus(&mut skel, &opts, &topo);

        // Initialize skel according to @opts.
        Self::init_globals(&mut skel, &opts, &topo);

        // Load the BPF program and attach it as sched_ext ops.
        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        // Build a ring buffer for instrumentation.
        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }

    fn init_cpus(skel: &mut OpenBpfSkel, opts: &Opts, topo: &FlatTopology) {
        debug!("{:#?}", topo);

        // Initialize CPU capacity.
        for cpu in topo.cpu_fids_performance.iter() {
            skel.maps.rodata_data.cpu_capacity[cpu.cpu_id] = cpu.cpu_cap as u16;
        }

        // If --cpu-pref-order is not specified, use the topologically
        // sorted CPU orders built by build_cpu_fids for each mode.
        // Otherwise, follow the specified CPU preference order for both
        // modes.
        let mut cpu_pf_order = vec![];
        let mut cpu_ps_order = vec![];
        if opts.cpu_pref_order.is_empty() {
            for cpu in topo.cpu_fids_performance.iter() {
                cpu_pf_order.push(cpu.cpu_id);
            }
            for cpu in topo.cpu_fids_powersave.iter() {
                cpu_ps_order.push(cpu.cpu_id);
            }
        } else {
            let cpu_list = read_cpulist(&opts.cpu_pref_order).unwrap();
            let pref_mask = Cpumask::from_cpulist(&opts.cpu_pref_order).unwrap();
            if pref_mask != topo.all_cpus_mask {
                panic!("--cpu-pref-order does not cover all the CPUs.");
            }
            cpu_pf_order = cpu_list.clone();
            cpu_ps_order = cpu_list;
        }
        for (pos, cpu) in cpu_pf_order.iter().enumerate() {
            skel.maps.rodata_data.cpu_order_performance[pos] = *cpu as u16;
        }
        for (pos, cpu) in cpu_ps_order.iter().enumerate() {
            skel.maps.rodata_data.cpu_order_powersave[pos] = *cpu as u16;
        }
        info!("CPU pref order in performance mode: {:?}", cpu_pf_order);
        info!("CPU pref order in powersave mode: {:?}", cpu_ps_order);

        // Initialize compute domain contexts.
        for (k, v) in topo.cpdom_map.iter() {
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].alt_id = v.cpdom_alt_id.get() as u64;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].node_id = k.node_id as u8;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].is_valid = 1;
            for cpu_id in v.cpu_ids.iter() {
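                // Set this CPU's bit in the domain's cpumask, which is
                // stored as an array of u64 words indexed by cpu_id / 64.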
                let i = cpu_id / 64;
                let j = cpu_id % 64;
                skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
            }

            if v.neighbor_map.borrow().len() > LAVD_CPDOM_MAX_DIST as usize {
                panic!("The processor topology is too complex to handle in BPF.");
            }

            for (i, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                let nr_neighbors = neighbors.borrow().len() as u8;
                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                    panic!("The processor topology is too complex to handle in BPF.");
                }
                skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].nr_neighbors[i] = nr_neighbors;
                for n in neighbors.borrow().iter() {
                    skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].neighbor_bits[i] |= 0x1 << n;
                }
            }
        }
    }

    fn is_powersave_mode(opts: &Opts) -> bool {
        opts.prefer_smt_core && opts.prefer_little_core
    }

    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, topo: &FlatTopology) {
        skel.maps.bss_data.no_core_compaction = opts.no_core_compaction;
        skel.maps.bss_data.no_freq_scaling = opts.no_freq_scaling;
        skel.maps.bss_data.no_prefer_turbo_core = opts.no_prefer_turbo_core;
        skel.maps.bss_data.is_powersave_mode = Self::is_powersave_mode(&opts);
        skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u64;
        skel.maps.rodata_data.is_smt_active = topo.smt_enabled;
        skel.maps.rodata_data.is_autopilot_on = opts.autopilot;
        skel.maps.rodata_data.verbose = opts.verbose;
        skel.maps.rodata_data.slice_max_ns = opts.slice_max_us * 1000;
        skel.maps.rodata_data.slice_min_ns = opts.slice_min_us * 1000;

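        // Opt into sched_ext behaviors where the running kernel supports
        // them; the compat constants are expected to resolve to 0 when a
        // flag is unavailable on the running kernel.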
        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
    }

    fn get_msg_seq_id() -> u64 {
        // Monotonically increasing sequence number for emitted samples.
        static MSEQ: AtomicU64 = AtomicU64::new(0);
        MSEQ.fetch_add(1, Ordering::Relaxed) + 1
    }

    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;
        let tc = mt.taskc;

        // Only LAVD_MSG_TASKC messages are understood here; ignore
        // anything else.
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

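        // comm and stat arrive as NUL-terminated C char arrays; borrow
        // them as &str before building the sample.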
        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tx.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tx.cpu_id,
            slice_ns: tc.slice_ns,
            lat_cri: tc.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            slice_boost_prio: tc.slice_boost_prio,
            run_freq: tc.run_freq,
            avg_runtime: tc.avg_runtime,
            wait_freq: tc.wait_freq,
            wake_freq: tc.wake_freq,
            perf_cri: tc.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util: tx.cpu_util,
            cpu_sutil: tx.cpu_sutil,
            nr_active: tx.nr_active,
        }) {
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", &e),
        }
    }

    fn prep_introspec(&mut self) {
        self.skel.maps.bss_data.intrspc.cmd = self.intrspc.cmd;
        self.skel.maps.bss_data.intrspc.arg = self.intrspc.arg;
    }

    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.intrspc.cmd = LAVD_CMD_NOP;
    }

    fn get_pc(x: u64, y: u64) -> f64 {
        100. * x as f64 / y as f64
    }

    fn get_power_mode(power_mode: i32) -> &'static str {
        match power_mode as u32 {
            LAVD_PM_PERFORMANCE => "performance",
            LAVD_PM_BALANCED => "balanced",
            LAVD_PM_POWERSAVE => "powersave",
            _ => "unknown",
        }
    }

    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
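                // Only the sampler thread registered via NewSampler may
                // pull system stats; other requesters get StatsRes::Bye.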
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = &self.skel.maps.bss_data;
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
        let prog = &mut self.skel.progs.set_power_profile;
        let mut args = power_arg {
            power_mode: mode as c_int,
        };
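        // Run the BPF syscall program once via test_run, handing it the
        // power_arg struct as its input context.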
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
        let profile = fetch_power_profile(false);
        if profile == prev_profile {
            // If the profile is the same, skip updating the profile for BPF.
            return (true, profile);
        }

        let _ = match profile {
            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
            PowerProfile::Balanced => self.set_power_profile(LAVD_PM_BALANCED),
            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
            PowerProfile::Unknown => {
                // We don't know how to handle an unknown energy profile,
                // so we just give up updating the profile from now on.
                return (false, profile);
            }
        };

        info!("Set the scheduler's power profile to {profile} mode.");
        (true, profile)
    }

    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

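        // Seed the BPF-side power profile from the requested mode; when
        // autopower is on, the loop below keeps it in sync with the
        // system's active power profile.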
        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
            self.cleanup_introspec();
        }
        self.rb_mgr.consume().unwrap();

        self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

fn init_log(opts: &Opts) {
    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )
    .unwrap();
}

fn main() -> Result<()> {
    let mut opts = Opts::parse();

    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    opts.proc().unwrap();
    debug!("{:#?}", opts);

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
    loop {
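        // Initialize and run the scheduler; loop to restart it when the
        // exit info from the BPF side requests a restart.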
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}