scx_lavd/main.rs

// SPDX-License-Identifier: GPL-2.0
//
// Copyright (c) 2024 Valve Corporation.
// Author: Changwoo Min <changwoo@igalia.com>

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

mod bpf_skel;
pub use bpf_skel::*;
pub mod bpf_intf;
pub use bpf_intf::*;

mod stats;
use std::cell::Cell;
use std::cell::RefCell;
use std::collections::BTreeMap;
use std::ffi::c_int;
use std::ffi::CStr;
use std::fmt;
use std::mem;
use std::mem::MaybeUninit;
use std::str;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;

use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use clap_num::number_range;
use crossbeam::channel;
use crossbeam::channel::Receiver;
use crossbeam::channel::RecvTimeoutError;
use crossbeam::channel::Sender;
use crossbeam::channel::TrySendError;
use itertools::iproduct;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use libc::c_char;
use log::debug;
use log::info;
use plain::Plain;
use scx_stats::prelude::*;
use scx_utils::autopower::{fetch_power_profile, PowerProfile};
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::read_cpulist;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::set_rlimit_infinity;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::EnergyModel;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPU_IDS;
use stats::SchedSample;
use stats::SchedSamples;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;

/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The Rust part is minimal. It processes command line options and logs
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See main.bpf.c for a more detailed overview of the LAVD design.
#[derive(Debug, Parser)]
struct Opts {
    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced), CPU preference order, etc., based on system
    /// load. The options affecting the power mode and the use of core compaction
    /// (--autopower, --performance, --powersave, --balanced,
    /// --no-core-compaction) cannot be used with this option. When no option
    /// is specified, this is the default mode.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced) based on the system's active power profile.
    /// The scheduler's power mode decides the CPU preference order and the use
    /// of core compaction, so the options affecting these (--autopilot,
    /// --performance, --powersave, --balanced, --no-core-compaction) cannot
    /// be used with this option.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    /// Run the scheduler in performance mode to get maximum performance.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --balanced, --powersave, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run the scheduler in powersave mode to minimize power consumption.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --performance, --balanced, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run the scheduler in balanced mode aiming for a sweet spot between
    /// power and performance. This option cannot be used with other
    /// conflicting options (--autopilot, --autopower, --performance,
    /// --powersave, --no-core-compaction) affecting the use of core compaction.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    /// Limit preemption to roughly the top P% of latency-critical tasks.
    /// When N is given as an argument, P is 0.5^N * 100. The default
    /// value is 6, which limits preemption to the top 1.56% of
    /// latency-critical tasks.
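    /// For example, N = 4 would limit preemption to the top
    /// 0.5^4 * 100 = 6.25% of latency-critical tasks.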
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4"). The scheduler
    /// uses the CPU preference order only when core compaction is enabled
    /// (i.e., balanced or powersave mode is specified as an option or chosen
    /// in the autopilot or autopower mode).
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Disable preemption.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    /// Disable an optimization for synchronous wake-ups.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Disable core compaction so the scheduler uses all the online CPUs.
    /// Core compaction attempts to minimize the number of actively used
    /// CPUs for unaffinitized tasks, respecting the CPU preference order.
    /// Normally, core compaction is enabled by the power mode (i.e.,
    /// balanced or powersave mode is specified as an option or chosen in
    /// the autopilot or autopower mode). This option cannot be used with the
    /// other options that control core compaction (--autopilot,
    /// --autopower, --performance, --balanced, --powersave).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Disable controlling the CPU frequency.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. The
    /// scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    /// Enable verbose output, including libbpf details. Specify multiple
    /// times to increase verbosity.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,
}
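
// Example invocation (illustrative):
//   scx_lavd --powersave --slice-max-us 3000
// runs in powersave mode with a 3 ms maximum time slice.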

impl Opts {
    fn can_autopilot(&self) -> bool {
        !self.autopower
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_autopower(&self) -> bool {
        !self.autopilot
            && !self.performance
            && !self.powersave
            && !self.balanced
            && !self.no_core_compaction
    }

    fn can_performance(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.powersave
            && !self.balanced
    }

    fn can_balanced(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.powersave
            && !self.no_core_compaction
    }

    fn can_powersave(&self) -> bool {
        !self.autopilot
            && !self.autopower
            && !self.performance
            && !self.balanced
            && !self.no_core_compaction
    }

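    /// Resolve the mode options into a final configuration. Autopilot is
    /// the fallback: it is enabled when no other mode option is given.
    /// Returns `None` if conflicting options are combined.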
    fn proc(&mut self) -> Option<&mut Self> {
        if !self.autopilot {
            self.autopilot = self.can_autopilot();
        }
        if self.autopilot {
            if !self.can_autopilot() {
                info!("Autopilot mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopilot mode is enabled.");
            return Some(self);
        }

        if self.autopower {
            if !self.can_autopower() {
                info!("Autopower mode cannot be used with conflicting options.");
                return None;
            }
            info!("Autopower mode is enabled.");
            return Some(self);
        }

        if self.performance {
            if !self.can_performance() {
                info!("Performance mode cannot be used with conflicting options.");
                return None;
            }
            info!("Performance mode is enabled.");
            self.no_core_compaction = true;
            return Some(self);
        }

        if self.powersave {
            if !self.can_powersave() {
                info!("Powersave mode cannot be used with conflicting options.");
                return None;
            }
            info!("Powersave mode is enabled.");
            self.no_core_compaction = false;
            return Some(self);
        }

        if self.balanced {
            if !self.can_balanced() {
                info!("Balanced mode cannot be used with conflicting options.");
                return None;
            }
            info!("Balanced mode is enabled.");
            self.no_core_compaction = false;
            return Some(self);
        }

        Some(self)
    }

    fn preempt_shift_range(s: &str) -> Result<u8, String> {
        number_range(s, 0, 10)
    }
}
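// The `plain` crate lets us reinterpret the raw bytes received from the BPF
// ring buffer as a `msg_task_ctx` without copying; this is sound only
// because the struct is plain old data.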
unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}

impl introspec {
    fn new() -> Self {
        unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() }
    }
}

#[derive(Debug, Clone)]
struct CpuFlatId {
    node_id: usize,
    pd_id: usize,
    llc_pos: usize,
    core_pos: usize,
    cpu_pos: usize,
    cpu_id: usize,
    smt_level: usize,
    cache_size: usize,
    cpu_cap: usize,
    big_core: bool,
    turbo_core: bool,
}

#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone)]
struct ComputeDomainKey {
    node_id: usize,
    llc_pos: usize,
    is_big: bool,
}

#[derive(Debug, Clone)]
struct ComputeDomainValue {
    cpdom_id: usize,
    cpdom_alt_id: Cell<usize>,
    cpu_ids: Vec<usize>,
    neighbor_map: RefCell<BTreeMap<usize, RefCell<Vec<usize>>>>,
}

#[derive(Debug)]
struct FlatTopology {
    all_cpus_mask: Cpumask,
    cpu_fids_performance: Vec<CpuFlatId>,
    cpu_fids_powersave: Vec<CpuFlatId>,
    cpdom_map: BTreeMap<ComputeDomainKey, ComputeDomainValue>,
    smt_enabled: bool,
}

impl fmt::Display for FlatTopology {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for cpu_fid in self.cpu_fids_performance.iter() {
            write!(f, "\nCPU in performance: {:?}", cpu_fid).ok();
        }
        for cpu_fid in self.cpu_fids_powersave.iter() {
            write!(f, "\nCPU in powersave: {:?}", cpu_fid).ok();
        }
        for (k, v) in self.cpdom_map.iter() {
            write!(f, "\nCPDOM: {:?} {:?}", k, v).ok();
        }
        write!(f, "\nSMT: {}", self.smt_enabled).ok();
        Ok(())
    }
}

impl FlatTopology {
    /// Build a flat-structured topology
    pub fn new() -> Result<FlatTopology> {
        let sys_topo = Topology::new().expect("Failed to build host topology");
        let sys_em = EnergyModel::new();
        debug!("{:#?}", sys_topo);
        debug!("{:#?}", sys_em);

        let cpu_fids_performance = Self::build_cpu_fids(&sys_topo, &sys_em, false).unwrap();
        let cpu_fids_powersave = Self::build_cpu_fids(&sys_topo, &sys_em, true).unwrap();

        // Note that building the compute domains does not depend on the CPU
        // order, so it is okay to use either of the cpu_fids_* vectors.
        let cpdom_map = Self::build_cpdom(&cpu_fids_performance).unwrap();

        Ok(FlatTopology {
            all_cpus_mask: sys_topo.span,
            cpu_fids_performance,
            cpu_fids_powersave,
            cpdom_map,
            smt_enabled: sys_topo.smt_enabled,
        })
    }

    /// Build a flat-structured list of CPUs in a preference order
    fn build_cpu_fids(
        topo: &Topology,
        em: &Result<EnergyModel>,
        prefer_powersave: bool,
    ) -> Option<Vec<CpuFlatId>> {
        let mut cpu_fids = Vec::new();

        // Build a vector of CPU flat ids.
        for (&node_id, node) in topo.nodes.iter() {
            for (llc_pos, (_llc_id, llc)) in node.llcs.iter().enumerate() {
                for (core_pos, (_core_id, core)) in llc.cores.iter().enumerate() {
                    for (cpu_pos, (cpu_id, cpu)) in core.cpus.iter().enumerate() {
                        let cpu_id = *cpu_id;
                        let pd_id = Self::get_pd_id(em, cpu_id, node_id);
                        cpu_fids.push(CpuFlatId {
                            node_id,
                            pd_id,
                            llc_pos,
                            core_pos,
                            cpu_pos,
                            cpu_id,
                            smt_level: cpu.smt_level,
                            cache_size: cpu.cache_size,
                            cpu_cap: cpu.cpu_capacity,
                            big_core: cpu.core_type != CoreType::Little,
                            turbo_core: cpu.core_type == CoreType::Big { turbo: true },
                        });
                    }
                }
            }
        }

        // Sort the cpu_fids.
        match prefer_powersave {
            true => {
                // Sort the cpu_fids by node, llc, cpu_cap, ^smt_level, ^cache_size, perf_dom, core, and cpu order.
                cpu_fids.sort_by(|a, b| {
                    a.node_id
                        .cmp(&b.node_id)
                        .then_with(|| a.llc_pos.cmp(&b.llc_pos))
                        .then_with(|| a.cpu_cap.cmp(&b.cpu_cap))
                        .then_with(|| b.smt_level.cmp(&a.smt_level))
                        .then_with(|| b.cache_size.cmp(&a.cache_size))
                        .then_with(|| a.pd_id.cmp(&b.pd_id))
                        .then_with(|| a.core_pos.cmp(&b.core_pos))
                        .then_with(|| a.cpu_pos.cmp(&b.cpu_pos))
                });
            }
            false => {
                // Sort the cpu_fids by node, llc, ^cpu_cap, cpu_pos, smt_level, ^cache_size, perf_dom, and core order.
                // For performance mode, prioritize CPU capacity over physical position for ARM big.LITTLE systems.
                cpu_fids.sort_by(|a, b| {
                    a.node_id
                        .cmp(&b.node_id) // NUMA node first
                        .then_with(|| a.llc_pos.cmp(&b.llc_pos)) // LLC locality
                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap)) // CPU performance first (^cpu_cap)
                        .then_with(|| a.cpu_pos.cmp(&b.cpu_pos)) // Physical position as a tie-breaker
                        .then_with(|| a.smt_level.cmp(&b.smt_level))
                        .then_with(|| b.cache_size.cmp(&a.cache_size))
                        .then_with(|| a.pd_id.cmp(&b.pd_id))
                        .then_with(|| a.core_pos.cmp(&b.core_pos))
                });
            }
        }

        Some(cpu_fids)
    }

    /// Get the performance domain (i.e., CPU frequency domain) ID for a CPU.
    /// If the energy model is not available, use the NUMA node ID instead.
    fn get_pd_id(em: &Result<EnergyModel>, cpu_id: usize, node_id: usize) -> usize {
        match em {
            Ok(em) => em.get_pd(cpu_id).unwrap().id,
            Err(_) => node_id,
        }
    }

    /// Build a list of compute domains
    fn build_cpdom(
        cpu_fids: &Vec<CpuFlatId>,
    ) -> Option<BTreeMap<ComputeDomainKey, ComputeDomainValue>> {
        // Create a compute domain map, where a compute domain is a set of
        // CPUs that are under the same node and LLC and have the same core
        // type.
        let mut cpdom_id = 0;
        let mut cpdom_map: BTreeMap<ComputeDomainKey, ComputeDomainValue> = BTreeMap::new();
        let mut cpdom_types: BTreeMap<usize, bool> = BTreeMap::new();
        for cpu_fid in cpu_fids.iter() {
            let key = ComputeDomainKey {
                node_id: cpu_fid.node_id,
                llc_pos: cpu_fid.llc_pos,
                is_big: cpu_fid.big_core,
            };
            let mut value;
            match cpdom_map.get(&key) {
                Some(v) => {
                    value = v.clone();
                }
                None => {
                    value = ComputeDomainValue {
                        cpdom_id,
                        cpdom_alt_id: Cell::new(cpdom_id),
                        cpu_ids: Vec::new(),
                        neighbor_map: RefCell::new(BTreeMap::new()),
                    };
                    cpdom_types.insert(cpdom_id, key.is_big);

                    cpdom_id += 1;
                }
            }
            value.cpu_ids.push(cpu_fid.cpu_id);
            cpdom_map.insert(key, value);
        }

        // Build a neighbor map for each compute domain, where neighbors are
        // ordered by core type, node, and LLC.
        for ((from_k, from_v), (to_k, to_v)) in iproduct!(cpdom_map.iter(), cpdom_map.iter()) {
            if from_k == to_k {
                continue;
            }

            let d = Self::dist(from_k, to_k);
            let mut map = from_v.neighbor_map.borrow_mut();
            match map.get(&d) {
                Some(v) => {
                    v.borrow_mut().push(to_v.cpdom_id);
                }
                None => {
                    map.insert(d, RefCell::new(vec![to_v.cpdom_id]));
                }
            }
        }

        // Fill in cpdom_alt_id for each compute domain.
        for (k, v) in cpdom_map.iter() {
            let mut key = k.clone();
            key.is_big = !k.is_big;

            if let Some(alt_v) = cpdom_map.get(&key) {
                // First, try to find an alternative domain
                // under the same node/LLC.
                v.cpdom_alt_id.set(alt_v.cpdom_id);
            } else {
                // If there is no alternative domain in the same node/LLC,
                // choose the closest one.
                //
                // Note that currently, the idle CPU selection (pick_idle_cpu)
                // is not optimized for this kind of architecture, where big
                // and LITTLE cores are in different nodes/LLCs.
                'outer: for (_dist, ncpdoms) in v.neighbor_map.borrow().iter() {
                    for ncpdom_id in ncpdoms.borrow().iter() {
                        if let Some(is_big) = cpdom_types.get(ncpdom_id) {
                            if *is_big == key.is_big {
                                v.cpdom_alt_id.set(*ncpdom_id);
                                break 'outer;
                            }
                        }
                    }
                }
            }
        }

        Some(cpdom_map)
    }
    /// Calculate the distance between two compute domains.
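    ///
    /// The weights make core type dominate NUMA node, which in turn
    /// dominates LLC: for example, two domains with different core types on
    /// different nodes are at distance 3 + 2 = 5, while same-type domains on
    /// different LLCs of the same node are at distance 1.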
    fn dist(from: &ComputeDomainKey, to: &ComputeDomainKey) -> usize {
        let mut d = 0;
        // core type > numa node > llc
        if from.is_big != to.is_big {
            d += 3;
        }
        if from.node_id != to.node_id {
            d += 2;
        } else if from.llc_pos != to.llc_pos {
            d += 1;
        }
        d
    }
}

struct Scheduler<'a> {
    skel: BpfSkel<'a>,
    struct_ops: Option<libbpf_rs::Link>,
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    intrspc: introspec,
    intrspc_rx: Receiver<SchedSample>,
    monitor_tid: Option<ThreadId>,
    stats_server: StatsServer<StatsReq, StatsRes>,
    mseq_id: u64,
}

impl<'a> Scheduler<'a> {
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Num possible CPU IDs ({}) exceeds maximum of ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        // Increase the MEMLOCK limit since the BPF scheduler might use
        // more than the current limit.
        set_rlimit_infinity();

        // Open the BPF prog first for verification.
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(opts.verbose > 0);
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops)?;

        // Enable autoloads for conditionally loaded things immediately
        // after creating skel (because this is always before loading).
        if !opts.no_futex_boost {
            compat::cond_tracepoint_enable(
                "syscalls:sys_enter_futex",
                &skel.progs.rtp_sys_enter_futex,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex",
                &skel.progs.rtp_sys_exit_futex,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex_wait",
                &skel.progs.rtp_sys_exit_futex_wait,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex_waitv",
                &skel.progs.rtp_sys_exit_futex_waitv,
            )?;
            compat::cond_tracepoint_enable(
                "syscalls:sys_exit_futex_wake",
                &skel.progs.rtp_sys_exit_futex_wake,
            )?;
        }

        // Initialize the CPU topology.
        let topo = FlatTopology::new().unwrap();
        Self::init_cpus(&mut skel, &opts, &topo);

        // Initialize skel according to @opts.
        Self::init_globals(&mut skel, &opts, &topo);

        // Load and attach.
        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        // Build a ring buffer for instrumentation.
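        // Samples are relayed through a bounded channel of 65536 entries;
        // when the consumer lags and the channel fills up, relay_introspec()
        // drops new samples instead of blocking the ring buffer callback.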
        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }

    fn init_cpus(skel: &mut OpenBpfSkel, opts: &Opts, topo: &FlatTopology) {
        debug!("{:#?}", topo);

        // Initialize CPU capacity.
        for cpu in topo.cpu_fids_performance.iter() {
            skel.maps.rodata_data.cpu_capacity[cpu.cpu_id] = cpu.cpu_cap as u16;
            skel.maps.rodata_data.cpu_big[cpu.cpu_id] = cpu.big_core as u8;
            skel.maps.rodata_data.cpu_turbo[cpu.cpu_id] = cpu.turbo_core as u8;
        }

        // If cpu_pref_order is not specified, initialize the CPU order
        // topologically, sorted by node, LLC, max_freq, core, and CPU order.
        // Otherwise, follow the specified CPU preference order.
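        // The preference string uses the kernel cpulist convention; e.g.,
        // "0-3,7" expands to CPUs [0, 1, 2, 3, 7].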
        let (cpu_pf_order, cpu_ps_order) = if opts.cpu_pref_order.is_empty() {
            (
                topo.cpu_fids_performance
                    .iter()
                    .map(|cpu| cpu.cpu_id)
                    .collect(),
                topo.cpu_fids_powersave
                    .iter()
                    .map(|cpu| cpu.cpu_id)
                    .collect(),
            )
        } else {
            let cpu_list = read_cpulist(&opts.cpu_pref_order).unwrap();
            let pref_mask = Cpumask::from_cpulist(&opts.cpu_pref_order).unwrap();
            if pref_mask != topo.all_cpus_mask {
                panic!("--cpu-pref-order does not cover all the CPUs.");
            }
            (cpu_list.clone(), cpu_list)
        };
        for (pos, cpu) in cpu_pf_order.iter().enumerate() {
            skel.maps.rodata_data.cpu_order_performance[pos] = *cpu as u16;
        }
        for (pos, cpu) in cpu_ps_order.iter().enumerate() {
            skel.maps.rodata_data.cpu_order_powersave[pos] = *cpu as u16;
        }
        if !opts.powersave {
            info!("CPU pref order in performance mode: {:?}", cpu_pf_order);
        }
        if !opts.performance {
            info!("CPU pref order in powersave mode: {:?}", cpu_ps_order);
        }

        // Initialize compute domain contexts.
        for (k, v) in topo.cpdom_map.iter() {
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].alt_id = v.cpdom_alt_id.get() as u64;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].node_id = k.node_id as u8;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
            skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].is_valid = 1;
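            // Pack the domain's CPUs into an array of 64-bit words:
            // e.g., CPU 68 sets bit 4 (68 % 64) of word 1 (68 / 64).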
            for cpu_id in v.cpu_ids.iter() {
                let i = cpu_id / 64;
                let j = cpu_id % 64;
                skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].__cpumask[i] |= 0x01 << j;
            }

            if v.neighbor_map.borrow().len() > LAVD_CPDOM_MAX_DIST as usize {
                panic!("The processor topology is too complex to handle in BPF.");
            }

            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
                let nr_neighbors = neighbors.borrow().len() as u8;
                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
                    panic!("The processor topology is too complex to handle in BPF.");
                }
                skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].nr_neighbors[k] = nr_neighbors;
                for n in neighbors.borrow().iter() {
                    skel.maps.bss_data.cpdom_ctxs[v.cpdom_id].neighbor_bits[k] |= 0x1 << n;
                }
            }
        }
    }

    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, topo: &FlatTopology) {
        skel.maps.bss_data.no_preemption = opts.no_preemption;
        skel.maps.bss_data.no_wake_sync = opts.no_wake_sync;
        skel.maps.bss_data.no_core_compaction = opts.no_core_compaction;
        skel.maps.bss_data.no_freq_scaling = opts.no_freq_scaling;
        skel.maps.bss_data.is_powersave_mode = opts.powersave;
        skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u64;
        skel.maps.rodata_data.is_smt_active = topo.smt_enabled;
        skel.maps.rodata_data.is_autopilot_on = opts.autopilot;
        skel.maps.rodata_data.verbose = opts.verbose;
        skel.maps.rodata_data.slice_max_ns = opts.slice_max_us * 1000;
        skel.maps.rodata_data.slice_min_ns = opts.slice_min_us * 1000;
        skel.maps.rodata_data.preempt_shift = opts.preempt_shift;

        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP
            | *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
    }
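    // A process-wide message sequence counter. Using `static mut` is only
    // sound here under the assumption that this is called from a single
    // thread (the ring buffer callback path); a relaxed AtomicU64 would be
    // the safer alternative.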
    fn get_msg_seq_id() -> u64 {
        static mut MSEQ: u64 = 0;
        unsafe {
            MSEQ += 1;
            MSEQ
        }
    }

    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;
        let tc = mt.taskc;

        // Only LAVD_MSG_TASKC messages are printable; ignore other kinds.
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

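        // Convert the fixed-size C char arrays into &str. This assumes the
        // BPF side always NUL-terminates `comm` and `stat` within their
        // buffers; otherwise CStr::from_ptr would read past the array.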
        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tx.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tx.cpu_id,
            slice_ns: tc.slice_ns,
            lat_cri: tc.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            run_freq: tc.run_freq,
            avg_runtime: tc.avg_runtime,
            wait_freq: tc.wait_freq,
            wake_freq: tc.wake_freq,
            perf_cri: tc.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util: tx.cpu_util,
            cpu_sutil: tx.cpu_sutil,
            nr_active: tx.nr_active,
        }) {
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
        }
    }

    fn prep_introspec(&mut self) {
        self.skel.maps.bss_data.intrspc.cmd = self.intrspc.cmd;
        self.skel.maps.bss_data.intrspc.arg = self.intrspc.arg;
    }

    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.intrspc.cmd = LAVD_CMD_NOP;
    }

    fn get_pc(x: u64, y: u64) -> f64 {
        100. * x as f64 / y as f64
    }

    fn get_power_mode(power_mode: i32) -> &'static str {
        match power_mode as u32 {
            LAVD_PM_PERFORMANCE => "performance",
            LAVD_PM_BALANCED => "balanced",
            LAVD_PM_POWERSAVE => "powersave",
            _ => "unknown",
        }
    }

    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = &self.skel.maps.bss_data;
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let nr_preempt = st.nr_preempt;
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    nr_preempt,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }

    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }

    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
        let prog = &mut self.skel.progs.set_power_profile;
        let mut args = power_arg {
            power_mode: mode as c_int,
        };
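        // Hand the argument struct to the BPF program as its raw bytes via
        // test_run; the BPF side is assumed to read it as its input context.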
        let input = ProgramInput {
            context_in: Some(unsafe {
                std::slice::from_raw_parts_mut(
                    &mut args as *mut _ as *mut u8,
                    std::mem::size_of_val(&args),
                )
            }),
            ..Default::default()
        };
        let out = prog.test_run(input).unwrap();
        if out.return_value != 0 {
            return Err(out.return_value);
        }

        Ok(())
    }

    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
        let profile = fetch_power_profile(false);
        if profile == prev_profile {
            // If the profile is the same, skip updating the profile for BPF.
            return (true, profile);
        }

        let _ = match profile {
            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
            PowerProfile::Unknown => {
                // We don't know how to handle an unknown power profile,
                // so we just give up updating the profile from now on.
                return (false, profile);
            }
        };

        info!("Set the scheduler's power profile to {profile} mode.");
        (true, profile)
    }

    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
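            // Re-check the system power profile once per loop iteration.
            // update_power_profile() returns false once the profile becomes
            // unknown, which permanently disables autopower polling.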
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
            self.cleanup_introspec();
        }
        self.rb_mgr.consume().unwrap();

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

fn init_log(opts: &Opts) {
    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )
    .unwrap();
}

fn main() -> Result<()> {
    let mut opts = Opts::parse();

    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    opts.proc().unwrap();
    info!("{:#?}", opts);

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

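    // --monitor and --stats share the same stats client thread. With
    // --monitor the scheduler is not launched, so we join and exit; with
    // --stats the thread runs alongside the scheduler launched below.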
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    let mut open_object = MaybeUninit::uninit();
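    // Keep re-initializing and re-attaching the scheduler for as long as
    // its exit status requests a restart.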
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}