Skip to main content

scx_lavd/
main.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2024 Valve Corporation.
4// Author: Changwoo Min <changwoo@igalia.com>
5
6// This software may be used and distributed according to the terms of the
7// GNU General Public License version 2.
8
9mod bpf_skel;
10pub use bpf_skel::*;
11pub mod bpf_intf;
12mod bpf_streams;
13pub use bpf_intf::*;
14
15mod cpu_order;
16use scx_utils::init_libbpf_logging;
17mod stats;
18use std::ffi::c_int;
19use std::ffi::CStr;
20use std::mem;
21use std::mem::MaybeUninit;
22use std::str;
23use std::sync::atomic::AtomicBool;
24use std::sync::atomic::Ordering;
25use std::sync::Arc;
26use std::thread::ThreadId;
27use std::time::Duration;
28
29use anyhow::Context;
30use anyhow::Result;
31use clap::Parser;
32use clap_num::number_range;
33use cpu_order::CpuOrder;
34use cpu_order::PerfCpuOrder;
35use crossbeam::channel;
36use crossbeam::channel::Receiver;
37use crossbeam::channel::RecvTimeoutError;
38use crossbeam::channel::Sender;
39use crossbeam::channel::TrySendError;
40use libbpf_rs::skel::Skel;
41use libbpf_rs::OpenObject;
42use libbpf_rs::PrintLevel;
43use libbpf_rs::ProgramInput;
44use libc::c_char;
45use plain::Plain;
46use scx_arena::ArenaLib;
47use scx_stats::prelude::*;
48use scx_utils::autopower::{fetch_power_profile, PowerProfile};
49use scx_utils::build_id;
50use scx_utils::compat;
51use scx_utils::ksym_exists;
52use scx_utils::libbpf_clap_opts::LibbpfOpts;
53use scx_utils::scx_ops_attach;
54use scx_utils::scx_ops_load;
55use scx_utils::scx_ops_open;
56use scx_utils::try_set_rlimit_infinity;
57use scx_utils::uei_exited;
58use scx_utils::uei_report;
59use scx_utils::EnergyModel;
60use scx_utils::TopologyArgs;
61use scx_utils::UserExitInfo;
62use scx_utils::NR_CPU_IDS;
63use stats::SchedSample;
64use stats::SchedSamples;
65use stats::StatsReq;
66use stats::StatsRes;
67use stats::SysStats;
68use tracing::{debug, info, warn};
69use tracing_subscriber::filter::EnvFilter;
70
// Name of this scheduler as a string constant. Its users are outside the
// part of the file visible here.
const SCHEDULER_NAME: &str = "scx_lavd";
/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The rust part is minimal. It processes command line options and logs out
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See the more detailed overview of the LAVD design at main.bpf.c.
//
// Maintainer notes (plain `//` comments so they do not end up in the
// generated `--help` text):
// - The `///` comments on the fields double as the clap help strings.
// - Cross-option validation is not done by clap; it lives in `Opts::proc()`.
#[derive(Debug, Parser)]
struct Opts {
    /// Deprecated, noop, use RUST_LOG or --log-level instead.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    // `Opts::proc()` switches this on by itself when none of the conflicting
    // options (see `can_autopilot()`) were given.
    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced), CPU preference order, etc, based on system
    /// load. The options affecting the power mode and the use of core compaction
    /// (--autopower, --performance, --powersave, --balanced,
    /// --no-core-compaction) cannot be used with this option. When no option
    /// is specified, this is a default mode.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced) based on the system's active power profile.
    /// The scheduler's power mode decides the CPU preference order and the use
    /// of core compaction, so the options affecting these (--autopilot,
    /// --performance, --powersave, --balanced, --no-core-compaction) cannot
    /// be used with this option.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    // `Opts::proc()` forces `no_core_compaction = true` in this mode.
    /// Run the scheduler in performance mode to get maximum performance.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --balanced, --powersave, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run the scheduler in powersave mode to minimize power consumption.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --performance, --balanced, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run the scheduler in balanced mode aiming for sweetspot between power
    /// and performance. This option cannot be used with other conflicting
    /// options (--autopilot, --autopower, --performance, --powersave,
    /// --no-core-compaction) affecting the use of core compaction.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    // Converted to nanoseconds in `Scheduler::init_globals()`.
    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    // Converted to nanoseconds in `Scheduler::init_globals()`.
    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    /// Target load percentage for turbulent CPUs relative to non-turbulent
    /// CPUs' per-capacity utilization. 100 means turbulent CPUs should carry
    /// the same per-capacity load as non-turbulent CPUs. Values below 100
    /// route fewer tasks to turbulent CPUs; values above 100 route more.
    /// Range: 0-200. Default: 100.
    #[clap(long = "lat-load-target-pct", default_value = "100", value_parser=Opts::lat_load_target_pct_range)]
    lat_load_target_pct: u16,

    /// Migration delta threshold percentage (0-100). When set to a non-zero value,
    /// the migration threshold is mig-delta-pct percent of the average load.
    /// Additionally, disables force task stealing in the consume path, relying only
    /// on the is_stealer/is_stealee thresholds for more predictable load balancing.
    /// Default is 0 (disabled, uses dynamic threshold based on load with both
    /// probabilistic and force task stealing enabled). This is an experimental feature.
    #[clap(long = "mig-delta-pct", default_value = "0", value_parser=Opts::mig_delta_pct_range)]
    mig_delta_pct: u8,

    // Rescaled from percent to a fraction of 1024 in
    // `Scheduler::init_globals()`.
    /// Low utilization threshold percentage (0-100) for periodic load balancing.
    /// When set to a non-zero value, periodic load balancing is skipped when
    /// the maximum per-domain utilization is below this percentage.
    /// Default is 25 (skip periodic LB below 25% utilization).
    /// Set to 0 to disable. Set to 100 to always skip periodic LB.
    #[clap(long = "lb-low-util-pct", default_value = "25", value_parser=Opts::lb_low_util_pct_range)]
    lb_low_util_pct: u8,

    // Rescaled from percent to a fraction of 1024 in
    // `Scheduler::init_globals()`.
    /// Low utilization threshold percentage (0-100) for bypassing deadline
    /// scheduling. When set to a non-zero value, tasks are dispatched directly
    /// to the local DSQ (FIFO) instead of using deadline-based ordering when
    /// the per-CPU utilization is below this percentage.
    /// Default is 10 (bypass deadline scheduling below 10% utilization).
    /// Set to 0 to disable. Set to 100 to always bypass deadline scheduling.
    #[clap(long = "lb-local-dsq-util-pct", default_value = "10", value_parser=Opts::lb_local_dsq_util_pct_range)]
    lb_local_dsq_util_pct: u8,

    // Range-checked against slice-min-us/slice-max-us in `Opts::proc()`.
    // `Scheduler::init_globals()` maps `None` to 0.
    // NOTE(review): with a `default_value` this is presumably always `Some`
    // after parsing — confirm against clap's handling of `Option` fields.
    /// Slice duration in microseconds to use for all tasks when pinned tasks
    /// are running on a CPU. Must be between slice-min-us and slice-max-us.
    /// When this option is enabled, pinned tasks are always enqueued to per-CPU DSQs
    /// and the dispatch logic compares vtimes across all DSQs to select the lowest
    /// vtime task. This helps improve responsiveness for pinned tasks. By default,
    /// this option is on with a default value of 5000 (5 msec). To turn off the option,
    /// explicitly set the value to 0.
    #[clap(long = "pinned-slice-us", default_value = "5000")]
    pinned_slice_us: Option<u64>,

    /// Limit the ratio of preemption to the roughly top P% of latency-critical
    /// tasks. When N is given as an argument, P is 0.5^N * 100. The default
    /// value is 6, which limits the preemption for the top 1.56% of
    /// latency-critical tasks.
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    // A non-empty value makes `Opts::proc()` set `no_use_em`.
    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4"). The scheduler
    /// uses the CPU preference mode only when the core compaction is enabled
    /// (i.e., balanced or powersave mode is specified as an option or chosen
    /// in the autopilot or autopower mode). When "--cpu-pref-order" is given,
    /// it implies "--no-use-em".
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    // Also set by `Opts::proc()` when `EnergyModel::has_energy_model()` is
    // false or when `cpu_pref_order` is non-empty.
    /// Do not use the energy model in making CPU preference order decisions.
    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    // When set, `Scheduler::init()` skips attaching the futex hooks.
    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Default: --no-fast-lb is deactivated (fast load balancer is on).
    /// Disable the fast (batch-migration) load balancer and fall back to
    /// the pre-fast-lb load-balancer behavior.
    #[clap(long = "no-fast-lb", action = clap::ArgAction::SetTrue)]
    no_fast_lb: bool,

    /// Disable preemption.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    /// Disable an optimization for synchronous wake-up.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Disable dynamic slice boost for long-running tasks.
    #[clap(long = "no-slice-boost", action = clap::ArgAction::SetTrue)]
    no_slice_boost: bool,

    /// Enables DSQs per CPU, this enables task queuing and dispatching
    /// from CPU specific DSQs. This generally increases L1/L2 cache
    /// locality for tasks and lowers lock contention compared to shared DSQs,
    /// but at the cost of higher load balancing complexity. This is a
    /// highly experimental feature.
    #[clap(long = "per-cpu-dsq", action = clap::ArgAction::SetTrue)]
    per_cpu_dsq: bool,

    /// Enable CPU bandwidth control using cpu.max in cgroup v2.
    /// This is a highly experimental feature.
    #[clap(long = "enable-cpu-bw", action = clap::ArgAction::SetTrue)]
    enable_cpu_bw: bool,

    // Adds `SCX_OPS_SWITCH_PARTIAL` to the ops flags in
    // `Scheduler::init_globals()`.
    /// If specified, only tasks which have their scheduling policy set to
    /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all
    /// tasks are switched.
    #[clap(long = "partial", action = clap::ArgAction::SetTrue)]
    partial: bool,

    // `Opts::proc()` overwrites this field: performance mode sets it to
    // true, powersave and balanced modes set it to false.
    ///
    /// Disable core compaction so the scheduler uses all the online CPUs.
    /// The core compaction attempts to minimize the number of actively used
    /// CPUs for unaffinitized tasks, respecting the CPU preference order.
    /// Normally, the core compaction is enabled by the power mode (i.e.,
    /// balanced or powersave mode is specified as an option or chosen in
    /// the autopilot or autopower mode). This option cannot be used with the
    /// other options that control the core compaction (--autopilot,
    /// --autopower, --performance, --balanced, --powersave).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Disable controlling the CPU frequency.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    // `Scheduler::init()` derives the BPF-side verbosity from this string:
    // 2 if it contains "trace", 1 if it contains "debug", otherwise 0.
    /// Specify the logging level. Accepts rust's envfilter syntax for modular
    /// logging: https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html#example-syntax. Examples: ["info", "warn,tokio=info"]
    #[clap(long, default_value = "info")]
    log_level: String,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Optional run ID for tracking scheduler instances.
    #[clap(long)]
    run_id: Option<u64>,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    // Converted into BPF open options in `Scheduler::init()`.
    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,

    // Passed to `CpuOrder::new()` in `Scheduler::init()`.
    /// Topology configuration options
    #[clap(flatten)]
    topology: Option<TopologyArgs>,
}
287
288impl Opts {
289    fn can_autopilot(&self) -> bool {
290        self.autopower == false
291            && self.performance == false
292            && self.powersave == false
293            && self.balanced == false
294            && self.no_core_compaction == false
295    }
296
297    fn can_autopower(&self) -> bool {
298        self.autopilot == false
299            && self.performance == false
300            && self.powersave == false
301            && self.balanced == false
302            && self.no_core_compaction == false
303    }
304
305    fn can_performance(&self) -> bool {
306        self.autopilot == false
307            && self.autopower == false
308            && self.powersave == false
309            && self.balanced == false
310    }
311
312    fn can_balanced(&self) -> bool {
313        self.autopilot == false
314            && self.autopower == false
315            && self.performance == false
316            && self.powersave == false
317            && self.no_core_compaction == false
318    }
319
320    fn can_powersave(&self) -> bool {
321        self.autopilot == false
322            && self.autopower == false
323            && self.performance == false
324            && self.balanced == false
325            && self.no_core_compaction == false
326    }
327
328    fn proc(&mut self) -> Option<&mut Self> {
329        if !self.autopilot {
330            self.autopilot = self.can_autopilot();
331        }
332
333        if self.autopilot {
334            if !self.can_autopilot() {
335                info!("Autopilot mode cannot be used with conflicting options.");
336                return None;
337            }
338            info!("Autopilot mode is enabled.");
339        }
340
341        if self.autopower {
342            if !self.can_autopower() {
343                info!("Autopower mode cannot be used with conflicting options.");
344                return None;
345            }
346            info!("Autopower mode is enabled.");
347        }
348
349        if self.performance {
350            if !self.can_performance() {
351                info!("Performance mode cannot be used with conflicting options.");
352                return None;
353            }
354            info!("Performance mode is enabled.");
355            self.no_core_compaction = true;
356        }
357
358        if self.powersave {
359            if !self.can_powersave() {
360                info!("Powersave mode cannot be used with conflicting options.");
361                return None;
362            }
363            info!("Powersave mode is enabled.");
364            self.no_core_compaction = false;
365        }
366
367        if self.balanced {
368            if !self.can_balanced() {
369                info!("Balanced mode cannot be used with conflicting options.");
370                return None;
371            }
372            info!("Balanced mode is enabled.");
373            self.no_core_compaction = false;
374        }
375
376        if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
377            self.no_use_em = true;
378        }
379        if self.no_use_em {
380            info!("Energy model won't be used for CPU preference order.");
381        }
382
383        if let Some(pinned_slice) = self.pinned_slice_us {
384            if pinned_slice == 0 {
385                info!("Pinned task slice mode is disabled. Pinned tasks will use per-domain DSQs.");
386            } else if pinned_slice < self.slice_min_us || pinned_slice > self.slice_max_us {
387                info!(
388                    "pinned-slice-us ({}) must be between slice-min-us ({}) and slice-max-us ({})",
389                    pinned_slice, self.slice_min_us, self.slice_max_us
390                );
391                return None;
392            } else {
393                info!(
394                "Pinned task slice mode is enabled ({} us). Pinned tasks will use per-CPU DSQs.",
395                pinned_slice
396            );
397            }
398        }
399
400        Some(self)
401    }
402
403    fn preempt_shift_range(s: &str) -> Result<u8, String> {
404        number_range(s, 0, 10)
405    }
406
407    fn lat_load_target_pct_range(s: &str) -> Result<u16, String> {
408        number_range(s, 0, 200)
409    }
410
411    fn mig_delta_pct_range(s: &str) -> Result<u8, String> {
412        number_range(s, 0, 100)
413    }
414
415    fn lb_low_util_pct_range(s: &str) -> Result<u8, String> {
416        number_range(s, 0, 100)
417    }
418
419    fn lb_local_dsq_util_pct_range(s: &str) -> Result<u8, String> {
420        number_range(s, 0, 100)
421    }
422}
423
// SAFETY: NOTE(review): `Plain` is an unsafe marker trait; implementing it
// asserts that `msg_task_ctx` may be reinterpreted from raw bytes. Presumably
// the type is a C-layout struct from the BPF interface bindings with no
// invalid bit patterns — confirm against its definition, which is outside
// this part of the file.
unsafe impl Plain for msg_task_ctx {}
425
426impl msg_task_ctx {
427    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
428        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
429    }
430}
431
432impl introspec {
433    fn new() -> Self {
434        let intrspc = unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() };
435        intrspc
436    }
437}
438
/// Userspace-side state of one scx_lavd scheduler instance, built by
/// `Scheduler::init()`.
struct Scheduler<'a> {
    /// BPF skeleton after it has been opened, loaded and attached.
    skel: BpfSkel<'a>,
    /// Link returned by `scx_ops_attach!` for `lavd_ops`.
    struct_ops: Option<libbpf_rs::Link>,
    /// Ring buffer over the `introspec_msg` map; its callback is
    /// `relay_introspec()`.
    rb_mgr: libbpf_rs::RingBuffer<'static>,
    /// Introspection command state, zero-initialized via `introspec::new()`.
    intrspc: introspec,
    /// Receiving end of the bounded channel that `relay_introspec()` feeds
    /// with `SchedSample`s.
    intrspc_rx: Receiver<SchedSample>,
    /// Initialized to `None` by `init()`.
    /// NOTE(review): presumably the id of the monitoring thread; it is set
    /// outside this part of the file — confirm.
    monitor_tid: Option<ThreadId>,
    /// Stats server launched by `init()`.
    stats_server: StatsServer<StatsReq, StatsRes>,
    /// Initialized to 0 by `init()`; its users are outside this part of the
    /// file.
    mseq_id: u64,
}
449
450impl<'a> Scheduler<'a> {
451    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
452        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
453            panic!(
454                "Num possible CPU IDs ({}) exceeds maximum of ({})",
455                *NR_CPU_IDS, LAVD_CPU_ID_MAX
456            );
457        }
458
459        try_set_rlimit_infinity();
460
461        // Open the BPF prog first for verification.
462        let debug_level = if opts.log_level.contains("trace") {
463            2
464        } else if opts.log_level.contains("debug") {
465            1
466        } else {
467            0
468        };
469        let mut skel_builder = BpfSkelBuilder::default();
470        skel_builder.obj_builder.debug(debug_level > 1);
471        init_libbpf_logging(Some(PrintLevel::Debug));
472
473        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
474        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops, open_opts)?;
475
476        // Enable futex tracing using ftrace if available. If the ftrace is not
477        // available, use tracepoint, which is known to be slower than ftrace.
478        if !opts.no_futex_boost {
479            if Self::attach_futex_ftraces(&mut skel)? == false {
480                info!("Fail to attach futex ftraces. Try with tracepoints.");
481                if Self::attach_futex_tracepoints(&mut skel)? == false {
482                    info!("Fail to attach futex tracepoints.");
483                }
484            }
485        }
486
487        // Initialize CPU topology with CLI arguments
488        let order = CpuOrder::new(opts.topology.as_ref(), opts.no_use_em).unwrap();
489        Self::init_cpus(&mut skel, &order);
490        Self::init_cpdoms(&mut skel, &order);
491
492        // When there are multiple domains, hook the execve() syscall family
493        // to enable aggressive cross-domain migration when execve() is called.
494        if order.cpdom_map.len() > 1 {
495            Self::attach_execve_tracepoints(&mut skel)?;
496        }
497
498        // Initialize skel according to @opts.
499        Self::init_globals(&mut skel, &opts, &order, debug_level);
500
501        // Initialize arena
502        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
503        let task_size = std::mem::size_of::<types::task_ctx>();
504        let arenalib = ArenaLib::init(skel.object_mut(), task_size, *NR_CPU_IDS)?;
505        arenalib.setup()?;
506
507        // Attach.
508        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
509        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;
510
511        // Build a ring buffer for instrumentation
512        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
513        let rb_map = &mut skel.maps.introspec_msg;
514        let mut builder = libbpf_rs::RingBufferBuilder::new();
515        builder
516            .add(rb_map, move |data| {
517                Scheduler::relay_introspec(data, &intrspc_tx)
518            })
519            .unwrap();
520        let rb_mgr = builder.build().unwrap();
521
522        Ok(Self {
523            skel,
524            struct_ops,
525            rb_mgr,
526            intrspc: introspec::new(),
527            intrspc_rx,
528            monitor_tid: None,
529            stats_server,
530            mseq_id: 0,
531        })
532    }
533
534    fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
535        let ftraces = vec![
536            ("__futex_wait", &skel.progs.fexit___futex_wait),
537            ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
538            (
539                "futex_wait_requeue_pi",
540                &skel.progs.fexit_futex_wait_requeue_pi,
541            ),
542            ("futex_wake", &skel.progs.fexit_futex_wake),
543            ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
544            ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
545            ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
546        ];
547
548        if compat::tracer_available("function")? == false {
549            info!("Ftrace is not enabled in the kernel.");
550            return Ok(false);
551        }
552
553        compat::cond_kprobes_enable(ftraces)
554    }
555
556    fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
557        let tracepoints = vec![
558            ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
559            ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
560            (
561                "syscalls:sys_exit_futex_wait",
562                &skel.progs.rtp_sys_exit_futex_wait,
563            ),
564            (
565                "syscalls:sys_exit_futex_waitv",
566                &skel.progs.rtp_sys_exit_futex_waitv,
567            ),
568            (
569                "syscalls:sys_exit_futex_wake",
570                &skel.progs.rtp_sys_exit_futex_wake,
571            ),
572        ];
573
574        compat::cond_tracepoints_enable(tracepoints)
575    }
576
577    fn attach_execve_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
578        let tracepoints = vec![
579            (
580                "syscalls:sys_enter_execve",
581                &skel.progs.cond_hook_sys_enter_execve,
582            ),
583            (
584                "syscalls:sys_enter_execveat",
585                &skel.progs.cond_hook_sys_enter_execveat,
586            ),
587        ];
588
589        compat::cond_tracepoints_enable(tracepoints)
590    }
591
592    fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
593        debug!("{:#?}", order);
594
595        // Initialize CPU capacity and sibling
596        for cpu in order.cpuids.iter() {
597            skel.maps.rodata_data.as_mut().unwrap().cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
598            skel.maps.rodata_data.as_mut().unwrap().cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
599            skel.maps.rodata_data.as_mut().unwrap().cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
600            skel.maps.rodata_data.as_mut().unwrap().cpu_sibling[cpu.cpu_adx] =
601                cpu.cpu_sibling as u32;
602        }
603
604        // Initialize performance vs. CPU order table.
605        let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
606        if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
607            panic!("Generated performance vs. CPU order stats are too complex ({nr_pco_states}) to handle");
608        }
609
610        skel.maps.rodata_data.as_mut().unwrap().nr_pco_states = nr_pco_states;
611        for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
612            Self::init_pco_tuple(skel, i, &pco);
613            info!("{:#}", pco);
614        }
615
616        let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
617        for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
618            Self::init_pco_tuple(skel, i as usize, &last_pco);
619        }
620    }
621
622    fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
623        let cpus_perf = pco.cpus_perf.borrow();
624        let cpus_ovflw = pco.cpus_ovflw.borrow();
625        let pco_nr_primary = cpus_perf.len();
626
627        skel.maps.rodata_data.as_mut().unwrap().pco_bounds[i] = pco.perf_cap as u32;
628        skel.maps.rodata_data.as_mut().unwrap().pco_nr_primary[i] = pco_nr_primary as u16;
629
630        for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
631            skel.maps.rodata_data.as_mut().unwrap().pco_table[i][j] = cpu_adx as u16;
632        }
633
634        for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
635            let k = j + pco_nr_primary;
636            skel.maps.rodata_data.as_mut().unwrap().pco_table[i][k] = cpu_adx as u16;
637        }
638    }
639
640    fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
641        // Initialize compute domain contexts
642        for (k, v) in order.cpdom_map.iter() {
643            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
644            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].alt_id =
645                v.cpdom_alt_id.get() as u64;
646            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].numa_id = k.numa_adx as u8;
647            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].llc_id = k.llc_adx as u8;
648            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
649            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].is_valid = 1;
650            for cpu_id in v.cpu_ids.iter() {
651                let i = cpu_id / 64;
652                let j = cpu_id % 64;
653                skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].__cpumask[i] |=
654                    0x01 << j;
655            }
656
657            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
658                panic!("The processor topology is too complex to handle in BPF.");
659            }
660
661            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
662                let nr_neighbors = neighbors.borrow().len() as u8;
663                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
664                    panic!("The processor topology is too complex to handle in BPF.");
665                }
666                skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].nr_neighbors[k] =
667                    nr_neighbors;
668                for (i, &id) in neighbors.borrow().iter().enumerate() {
669                    let idx = (k * LAVD_CPDOM_MAX_NR as usize) + i;
670                    skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].neighbor_ids[idx] =
671                        id as u8;
672                }
673            }
674        }
675    }
676
677    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder, debug_level: u8) {
678        let bss_data = skel.maps.bss_data.as_mut().unwrap();
679        bss_data.no_preemption = opts.no_preemption;
680        bss_data.no_core_compaction = opts.no_core_compaction;
681        bss_data.no_freq_scaling = opts.no_freq_scaling;
682        bss_data.is_powersave_mode = opts.powersave;
683        let rodata = skel.maps.rodata_data.as_mut().unwrap();
684        rodata.nr_llcs = order.nr_llcs as u64;
685        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
686        rodata.is_smt_active = order.smt_enabled;
687        rodata.is_autopilot_on = opts.autopilot;
688        rodata.verbose = debug_level;
689        rodata.slice_max_ns = opts.slice_max_us * 1000;
690        rodata.slice_min_ns = opts.slice_min_us * 1000;
691        rodata.pinned_slice_ns = opts.pinned_slice_us.map(|v| v * 1000).unwrap_or(0);
692        rodata.preempt_shift = opts.preempt_shift;
693        rodata.lat_load_target_pct = opts.lat_load_target_pct;
694        rodata.mig_delta_pct = opts.mig_delta_pct;
695        rodata.lb_low_util_wall = ((opts.lb_low_util_pct as u64) << 10) / 100;
696        rodata.lb_local_dsq_util_wall = ((opts.lb_local_dsq_util_pct as u64) << 10) / 100;
697        rodata.no_use_em = opts.no_use_em as u8;
698        rodata.no_fast_lb = opts.no_fast_lb as u8;
699        rodata.no_wake_sync = opts.no_wake_sync;
700        rodata.no_slice_boost = opts.no_slice_boost;
701        rodata.per_cpu_dsq = opts.per_cpu_dsq;
702        rodata.enable_cpu_bw = opts.enable_cpu_bw;
703
704        if !ksym_exists("scx_group_set_bandwidth").unwrap() {
705            skel.struct_ops.lavd_ops_mut().cgroup_set_bandwidth = std::ptr::null_mut();
706            warn!("Kernel does not support ops.cgroup_set_bandwidth(), so disable it.");
707        }
708
709        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
710            | *compat::SCX_OPS_ENQ_LAST
711            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
712            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;
713
714        if opts.partial {
715            skel.struct_ops.lavd_ops_mut().flags |= *compat::SCX_OPS_SWITCH_PARTIAL;
716        }
717    }
718
719    fn get_msg_seq_id() -> u64 {
720        static mut MSEQ: u64 = 0;
721        unsafe {
722            MSEQ += 1;
723            MSEQ
724        }
725    }
726
727    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
728        let mt = msg_task_ctx::from_bytes(data);
729        let tx = mt.taskc_x;
730
731        // No idea how to print other types than LAVD_MSG_TASKC
732        if mt.hdr.kind != LAVD_MSG_TASKC {
733            return 0;
734        }
735
736        let mseq = Scheduler::get_msg_seq_id();
737
738        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
739        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
740        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();
741
742        let c_waker_cm: *const c_char = (&tx.waker_comm as *const [c_char; 17]) as *const c_char;
743        let c_waker_cm_str: &CStr = unsafe { CStr::from_ptr(c_waker_cm) };
744        let waker_comm: &str = c_waker_cm_str.to_str().unwrap();
745
746        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
747        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
748        let tx_stat: &str = c_tx_st_str.to_str().unwrap();
749
750        match intrspc_tx.try_send(SchedSample {
751            mseq,
752            pid: tx.pid,
753            comm: tx_comm.into(),
754            stat: tx_stat.into(),
755            cpu_id: tx.cpu_id,
756            prev_cpu_id: tx.prev_cpu_id,
757            suggested_cpu_id: tx.suggested_cpu_id,
758            waker_pid: tx.waker_pid,
759            waker_comm: waker_comm.into(),
760            slice_wall: tx.slice_wall,
761            lat_cri: tx.lat_cri,
762            avg_lat_cri: tx.avg_lat_cri,
763            static_prio: tx.static_prio,
764            rerunnable_interval_wall: tx.rerunnable_interval_wall,
765            resched_interval_wall: tx.resched_interval_wall,
766            run_freq: tx.run_freq,
767            avg_runtime_wall: tx.avg_runtime_wall,
768            wait_freq: tx.wait_freq,
769            wake_freq: tx.wake_freq,
770            perf_cri: tx.perf_cri,
771            thr_perf_cri: tx.thr_perf_cri,
772            cpuperf_cur: tx.cpuperf_cur,
773            cpu_util_wall: tx.cpu_util_wall,
774            cpu_util_invr: tx.cpu_util_invr,
775            steal_util_wall: tx.steal_util_wall,
776            steal_util_invr: tx.steal_util_invr,
777            dom_pinned_util_wall: tx.dom_pinned_util_wall,
778            dom_pinned_util_invr: tx.dom_pinned_util_invr,
779            nr_active: tx.nr_active,
780            dsq_id: tx.dsq_id,
781            dsq_consume_lat: tx.dsq_consume_lat,
782            lat_headroom: tx.lat_headroom,
783            vuln_thresh: tx.vuln_thresh,
784            task_util_est: tx.task_util_est,
785            norm_lat_cri: tx.norm_lat_cri,
786            slice_used_wall: tx.last_slice_used_wall,
787        }) {
788            Ok(()) | Err(TrySendError::Full(_)) => 0,
789            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
790        }
791    }
792
793    fn prep_introspec(&mut self) {
794        if !self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
795            self.skel.maps.bss_data.as_mut().unwrap().is_monitored = true;
796        }
797        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = self.intrspc.cmd;
798        self.skel.maps.bss_data.as_mut().unwrap().intrspc.arg = self.intrspc.arg;
799    }
800
801    fn cleanup_introspec(&mut self) {
802        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = LAVD_CMD_NOP;
803    }
804
805    fn get_pc(x: u64, y: u64) -> f64 {
806        return 100. * x as f64 / y as f64;
807    }
808
809    fn get_power_mode(power_mode: i32) -> &'static str {
810        match power_mode as u32 {
811            LAVD_PM_PERFORMANCE => "performance",
812            LAVD_PM_BALANCED => "balanced",
813            LAVD_PM_POWERSAVE => "powersave",
814            _ => "unknown",
815        }
816    }
817
818    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
819        Ok(match req {
820            StatsReq::NewSampler(tid) => {
821                self.rb_mgr.consume().unwrap();
822                self.monitor_tid = Some(*tid);
823                StatsRes::Ack
824            }
825            StatsReq::SysStatsReq { tid } => {
826                if Some(*tid) != self.monitor_tid {
827                    return Ok(StatsRes::Bye);
828                }
829                self.mseq_id += 1;
830
831                let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
832                let st = bss_data.sys_stat;
833
834                let mseq = self.mseq_id;
835                let nr_queued_task = st.nr_queued_task;
836                let nr_active = st.nr_active;
837                let nr_sched = st.nr_sched;
838                let nr_preempt = st.nr_preempt;
839                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
840                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
841                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
842                let nr_stealee = st.nr_stealee;
843                let nr_big = st.nr_big;
844                let pc_big = Self::get_pc(nr_big, nr_sched);
845                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
846                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
847                let power_mode = Self::get_power_mode(bss_data.power_mode);
848                let total_time = bss_data.performance_mode_ns
849                    + bss_data.balanced_mode_ns
850                    + bss_data.powersave_mode_ns;
851                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
852                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
853                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);
854
855                StatsRes::SysStats(SysStats {
856                    mseq,
857                    nr_queued_task,
858                    nr_active,
859                    nr_sched,
860                    nr_preempt,
861                    pc_pc,
862                    pc_lc,
863                    pc_x_migration,
864                    nr_stealee,
865                    pc_big,
866                    pc_pc_on_big,
867                    pc_lc_on_big,
868                    power_mode: power_mode.to_string(),
869                    pc_performance,
870                    pc_balanced,
871                    pc_powersave,
872                })
873            }
874            StatsReq::SchedSamplesNr {
875                tid,
876                nr_samples,
877                interval_ms,
878            } => {
879                if Some(*tid) != self.monitor_tid {
880                    return Ok(StatsRes::Bye);
881                }
882
883                self.intrspc.cmd = LAVD_CMD_SCHED_N;
884                self.intrspc.arg = *nr_samples;
885                self.prep_introspec();
886                std::thread::sleep(Duration::from_millis(*interval_ms));
887                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();
888
889                let mut samples = vec![];
890                while let Ok(ts) = self.intrspc_rx.try_recv() {
891                    samples.push(ts);
892                }
893
894                self.cleanup_introspec();
895
896                StatsRes::SchedSamples(SchedSamples { samples })
897            }
898        })
899    }
900
901    fn stop_monitoring(&mut self) {
902        if self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
903            self.skel.maps.bss_data.as_mut().unwrap().is_monitored = false;
904        }
905    }
906
907    pub fn exited(&mut self) -> bool {
908        uei_exited!(&self.skel, uei)
909    }
910
911    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
912        let prog = &mut self.skel.progs.set_power_profile;
913        let mut args = power_arg {
914            power_mode: mode as c_int,
915        };
916        let input = ProgramInput {
917            context_in: Some(unsafe {
918                std::slice::from_raw_parts_mut(
919                    &mut args as *mut _ as *mut u8,
920                    std::mem::size_of_val(&args),
921                )
922            }),
923            ..Default::default()
924        };
925        let out = prog.test_run(input).unwrap();
926        if out.return_value != 0 {
927            return Err(out.return_value);
928        }
929
930        Ok(())
931    }
932
933    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
934        let profile = fetch_power_profile(false);
935        if profile == prev_profile {
936            // If the profile is the same, skip updating the profile for BPF.
937            return (true, profile);
938        }
939
940        let _ = match profile {
941            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
942            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
943            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
944            PowerProfile::Unknown => {
945                // We don't know how to handle an unknown energy profile,
946                // so we just give up updating the profile from now on.
947                return (false, profile);
948            }
949        };
950
951        info!("Set the scheduler's power profile to {profile} mode.");
952        (true, profile)
953    }
954
955    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
956        let (res_ch, req_ch) = self.stats_server.channels();
957        let mut autopower = opts.autopower;
958        let mut profile = PowerProfile::Unknown;
959
960        if opts.performance {
961            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
962        } else if opts.powersave {
963            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
964        } else {
965            let _ = self.set_power_profile(LAVD_PM_BALANCED);
966        }
967
968        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
969            if autopower {
970                (autopower, profile) = self.update_power_profile(profile);
971            }
972
973            match req_ch.recv_timeout(Duration::from_secs(1)) {
974                Ok(req) => {
975                    let res = self.stats_req_to_res(&req)?;
976                    res_ch.send(res)?;
977                }
978                Err(RecvTimeoutError::Timeout) => {
979                    self.stop_monitoring();
980                }
981                Err(e) => {
982                    self.stop_monitoring();
983                    Err(e)?
984                }
985            }
986            self.cleanup_introspec();
987        }
988        self.rb_mgr.consume().unwrap();
989
990        bpf_streams::dump_bpf_streams(&mut self.skel);
991        let _ = self.struct_ops.take();
992        uei_report!(&self.skel, uei)
993    }
994}
995
996impl Drop for Scheduler<'_> {
997    fn drop(&mut self) {
998        info!("Unregister {SCHEDULER_NAME} scheduler");
999
1000        if let Some(struct_ops) = self.struct_ops.take() {
1001            drop(struct_ops);
1002        }
1003    }
1004}
1005
1006fn init_log(opts: &Opts) {
1007    let env_filter = EnvFilter::try_from_default_env()
1008        .or_else(|_| match EnvFilter::try_new(&opts.log_level) {
1009            Ok(filter) => Ok(filter),
1010            Err(e) => {
1011                eprintln!(
1012                    "invalid log envvar: {}, using info, err is: {}",
1013                    opts.log_level, e
1014                );
1015                EnvFilter::try_new("info")
1016            }
1017        })
1018        .unwrap_or_else(|_| EnvFilter::new("info"));
1019
1020    match tracing_subscriber::fmt()
1021        .with_env_filter(env_filter)
1022        .with_target(true)
1023        .with_thread_ids(true)
1024        .with_file(true)
1025        .with_line_number(true)
1026        .try_init()
1027    {
1028        Ok(()) => {}
1029        Err(e) => eprintln!("failed to init logger: {}", e),
1030    }
1031}
1032
1033#[clap_main::clap_main]
1034fn main(mut opts: Opts) -> Result<()> {
1035    if opts.version {
1036        println!(
1037            "scx_lavd {}",
1038            build_id::full_version(env!("CARGO_PKG_VERSION"))
1039        );
1040        return Ok(());
1041    }
1042
1043    if opts.help_stats {
1044        let sys_stats_meta_name = SysStats::meta().name;
1045        let sched_sample_meta_name = SchedSample::meta().name;
1046        let stats_meta_names: &[&str] = &[
1047            sys_stats_meta_name.as_str(),
1048            sched_sample_meta_name.as_str(),
1049        ];
1050        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
1051        return Ok(());
1052    }
1053
1054    init_log(&opts);
1055
1056    if opts.verbose > 0 {
1057        warn!("Setting verbose via -v is deprecated and will be an error in future releases.");
1058    }
1059
1060    if let Some(run_id) = opts.run_id {
1061        info!("scx_lavd run_id: {}", run_id);
1062    }
1063
1064    if opts.monitor.is_none() && opts.monitor_sched_samples.is_none() {
1065        opts.proc().unwrap();
1066        info!("{:#?}", opts);
1067    }
1068
1069    let shutdown = Arc::new(AtomicBool::new(false));
1070    let shutdown_clone = shutdown.clone();
1071    ctrlc::set_handler(move || {
1072        shutdown_clone.store(true, Ordering::Relaxed);
1073    })
1074    .context("Error setting Ctrl-C handler")?;
1075
1076    if let Some(nr_samples) = opts.monitor_sched_samples {
1077        let shutdown_copy = shutdown.clone();
1078        let jh = std::thread::spawn(move || {
1079            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
1080        });
1081        let _ = jh.join();
1082        return Ok(());
1083    }
1084
1085    if let Some(intv) = opts.monitor.or(opts.stats) {
1086        let shutdown_copy = shutdown.clone();
1087        let jh = std::thread::spawn(move || {
1088            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
1089        });
1090        if opts.monitor.is_some() {
1091            let _ = jh.join();
1092            return Ok(());
1093        }
1094    }
1095
1096    let mut open_object = MaybeUninit::uninit();
1097    loop {
1098        let mut sched = Scheduler::init(&opts, &mut open_object)?;
1099        info!(
1100            "scx_lavd scheduler is initialized (build ID: {})",
1101            build_id::full_version(env!("CARGO_PKG_VERSION"))
1102        );
1103        info!("scx_lavd scheduler starts running.");
1104        if !sched.run(&opts, shutdown.clone())?.should_restart() {
1105            break;
1106        }
1107    }
1108
1109    Ok(())
1110}