Skip to main content

scx_lavd/
main.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2024 Valve Corporation.
4// Author: Changwoo Min <changwoo@igalia.com>
5
6// This software may be used and distributed according to the terms of the
7// GNU General Public License version 2.
8
9mod bpf_skel;
10pub use bpf_skel::*;
11pub mod bpf_intf;
12pub use bpf_intf::*;
13
14mod cpu_order;
15use scx_utils::init_libbpf_logging;
16mod stats;
17use std::ffi::c_int;
18use std::ffi::CStr;
19use std::mem;
20use std::mem::MaybeUninit;
21use std::str;
22use std::sync::atomic::AtomicBool;
23use std::sync::atomic::Ordering;
24use std::sync::Arc;
25use std::thread::ThreadId;
26use std::time::Duration;
27
28use anyhow::Context;
29use anyhow::Result;
30use clap::Parser;
31use clap_num::number_range;
32use cpu_order::CpuOrder;
33use cpu_order::PerfCpuOrder;
34use crossbeam::channel;
35use crossbeam::channel::Receiver;
36use crossbeam::channel::RecvTimeoutError;
37use crossbeam::channel::Sender;
38use crossbeam::channel::TrySendError;
39use libbpf_rs::skel::Skel;
40use libbpf_rs::OpenObject;
41use libbpf_rs::PrintLevel;
42use libbpf_rs::ProgramInput;
43use libc::c_char;
44use plain::Plain;
45use scx_arena::ArenaLib;
46use scx_stats::prelude::*;
47use scx_utils::autopower::{fetch_power_profile, PowerProfile};
48use scx_utils::build_id;
49use scx_utils::compat;
50use scx_utils::ksym_exists;
51use scx_utils::libbpf_clap_opts::LibbpfOpts;
52use scx_utils::scx_ops_attach;
53use scx_utils::scx_ops_load;
54use scx_utils::scx_ops_open;
55use scx_utils::try_set_rlimit_infinity;
56use scx_utils::uei_exited;
57use scx_utils::uei_report;
58use scx_utils::EnergyModel;
59use scx_utils::TopologyArgs;
60use scx_utils::UserExitInfo;
61use scx_utils::NR_CPU_IDS;
62use stats::SchedSample;
63use stats::SchedSamples;
64use stats::StatsReq;
65use stats::StatsRes;
66use stats::SysStats;
67use tracing::{debug, info, warn};
68use tracing_subscriber::filter::EnvFilter;
69
const SCHEDULER_NAME: &str = "scx_lavd";
/// scx_lavd: Latency-criticality Aware Virtual Deadline (LAVD) scheduler
///
/// The rust part is minimal. It processes command line options and logs out
/// scheduling statistics. The BPF part makes all the scheduling decisions.
/// See the more detailed overview of the LAVD design at main.bpf.c.
#[derive(Debug, Parser)]
struct Opts {
    /// Deprecated, noop, use RUST_LOG or --log-level instead.
    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
    verbose: u8,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced), CPU preference order, etc, based on system
    /// load. The options affecting the power mode and the use of core compaction
    /// (--autopower, --performance, --powersave, --balanced,
    /// --no-core-compaction) cannot be used with this option. When no option
    /// is specified, this is a default mode.
    #[clap(long = "autopilot", action = clap::ArgAction::SetTrue)]
    autopilot: bool,

    /// Automatically decide the scheduler's power mode (performance vs.
    /// powersave vs. balanced) based on the system's active power profile.
    /// The scheduler's power mode decides the CPU preference order and the use
    /// of core compaction, so the options affecting these (--autopilot,
    /// --performance, --powersave, --balanced, --no-core-compaction) cannot
    /// be used with this option.
    #[clap(long = "autopower", action = clap::ArgAction::SetTrue)]
    autopower: bool,

    /// Run the scheduler in performance mode to get maximum performance.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --balanced, --powersave, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "performance", action = clap::ArgAction::SetTrue)]
    performance: bool,

    /// Run the scheduler in powersave mode to minimize power consumption.
    /// This option cannot be used with other conflicting options (--autopilot,
    /// --autopower, --performance, --balanced, --no-core-compaction)
    /// affecting the use of core compaction.
    #[clap(long = "powersave", action = clap::ArgAction::SetTrue)]
    powersave: bool,

    /// Run the scheduler in balanced mode aiming for a sweet spot between power
    /// and performance. This option cannot be used with other conflicting
    /// options (--autopilot, --autopower, --performance, --powersave,
    /// --no-core-compaction) affecting the use of core compaction.
    #[clap(long = "balanced", action = clap::ArgAction::SetTrue)]
    balanced: bool,

    /// Maximum scheduling slice duration in microseconds.
    #[clap(long = "slice-max-us", default_value = "5000")]
    slice_max_us: u64,

    /// Minimum scheduling slice duration in microseconds.
    #[clap(long = "slice-min-us", default_value = "500")]
    slice_min_us: u64,

    /// Migration delta threshold percentage (0-100). When set to a non-zero value,
    /// the migration threshold is mig-delta-pct percent of the average load.
    /// Additionally, disables force task stealing in the consume path, relying only
    /// on the is_stealer/is_stealee thresholds for more predictable load balancing.
    /// Default is 0 (disabled, uses dynamic threshold based on load with both
    /// probabilistic and force task stealing enabled). This is an experimental feature.
    #[clap(long = "mig-delta-pct", default_value = "0", value_parser=Opts::mig_delta_pct_range)]
    mig_delta_pct: u8,

    /// Low utilization threshold percentage (0-100) for periodic load balancing.
    /// When set to a non-zero value, periodic load balancing is skipped when
    /// the maximum per-domain utilization is below this percentage.
    /// Default is 25 (skip periodic LB below 25% utilization).
    /// Set to 0 to disable. Set to 100 to always skip periodic LB.
    #[clap(long = "lb-low-util-pct", default_value = "25", value_parser=Opts::lb_low_util_pct_range)]
    lb_low_util_pct: u8,

    /// Low utilization threshold percentage (0-100) for bypassing deadline
    /// scheduling. When set to a non-zero value, tasks are dispatched directly
    /// to the local DSQ (FIFO) instead of using deadline-based ordering when
    /// the per-CPU utilization is below this percentage.
    /// Default is 10 (bypass deadline scheduling below 10% utilization).
    /// Set to 0 to disable. Set to 100 to always bypass deadline scheduling.
    #[clap(long = "lb-local-dsq-util-pct", default_value = "10", value_parser=Opts::lb_local_dsq_util_pct_range)]
    lb_local_dsq_util_pct: u8,

    /// Slice duration in microseconds to use for all tasks when pinned tasks
    /// are running on a CPU. Must be between slice-min-us and slice-max-us.
    /// When this option is enabled, pinned tasks are always enqueued to per-CPU DSQs
    /// and the dispatch logic compares vtimes across all DSQs to select the lowest
    /// vtime task. This helps improve responsiveness for pinned tasks. By default,
    /// this option is on with a default value of 5000 (5 msec). To turn off the option,
    /// explicitly set the value to 0.
    #[clap(long = "pinned-slice-us", default_value = "5000")]
    pinned_slice_us: Option<u64>,

    /// Limit the ratio of preemption to the roughly top P% of latency-critical
    /// tasks. When N is given as an argument, P is 0.5^N * 100. The default
    /// value is 6, which limits the preemption for the top 1.56% of
    /// latency-critical tasks.
    #[clap(long = "preempt-shift", default_value = "6", value_parser=Opts::preempt_shift_range)]
    preempt_shift: u8,

    /// List of CPUs in preferred order (e.g., "0-3,7,6,5,4"). The scheduler
    /// uses the CPU preference mode only when the core compaction is enabled
    /// (i.e., balanced or powersave mode is specified as an option or chosen
    /// in the autopilot or autopower mode). When "--cpu-pref-order" is given,
    /// it implies "--no-use-em".
    #[clap(long = "cpu-pref-order", default_value = "")]
    cpu_pref_order: String,

    /// Do not use the energy model in making CPU preference order decisions.
    #[clap(long = "no-use-em", action = clap::ArgAction::SetTrue)]
    no_use_em: bool,

    /// Do not boost futex holders.
    #[clap(long = "no-futex-boost", action = clap::ArgAction::SetTrue)]
    no_futex_boost: bool,

    /// Disable preemption.
    #[clap(long = "no-preemption", action = clap::ArgAction::SetTrue)]
    no_preemption: bool,

    /// Disable an optimization for synchronous wake-up.
    #[clap(long = "no-wake-sync", action = clap::ArgAction::SetTrue)]
    no_wake_sync: bool,

    /// Disable dynamic slice boost for long-running tasks.
    #[clap(long = "no-slice-boost", action = clap::ArgAction::SetTrue)]
    no_slice_boost: bool,

    /// Enables DSQs per CPU, this enables task queuing and dispatching
    /// from CPU specific DSQs. This generally increases L1/L2 cache
    /// locality for tasks and lowers lock contention compared to shared DSQs,
    /// but at the cost of higher load balancing complexity. This is a
    /// highly experimental feature.
    #[clap(long = "per-cpu-dsq", action = clap::ArgAction::SetTrue)]
    per_cpu_dsq: bool,

    /// Enable CPU bandwidth control using cpu.max in cgroup v2.
    /// This is a highly experimental feature.
    #[clap(long = "enable-cpu-bw", action = clap::ArgAction::SetTrue)]
    enable_cpu_bw: bool,

    /// If specified, only tasks which have their scheduling policy set to
    /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all
    /// tasks are switched.
    #[clap(long = "partial", action = clap::ArgAction::SetTrue)]
    partial: bool,

    /// Disable core compaction so the scheduler uses all the online CPUs.
    /// The core compaction attempts to minimize the number of actively used
    /// CPUs for unaffinitized tasks, respecting the CPU preference order.
    /// Normally, the core compaction is enabled by the power mode (i.e.,
    /// balanced or powersave mode is specified as an option or chosen in
    /// the autopilot or autopower mode). This option cannot be used with the
    /// other options that control the core compaction (--autopilot,
    /// --autopower, --performance, --balanced, --powersave).
    #[clap(long = "no-core-compaction", action = clap::ArgAction::SetTrue)]
    no_core_compaction: bool,

    /// Disable controlling the CPU frequency.
    #[clap(long = "no-freq-scaling", action = clap::ArgAction::SetTrue)]
    no_freq_scaling: bool,

    /// Enable stats monitoring with the specified interval.
    #[clap(long)]
    stats: Option<f64>,

    /// Run in stats monitoring mode with the specified interval. Scheduler is not launched.
    #[clap(long)]
    monitor: Option<f64>,

    /// Run in monitoring mode. Show the specified number of scheduling
    /// samples every second.
    #[clap(long)]
    monitor_sched_samples: Option<u64>,

    /// Specify the logging level. Accepts rust's envfilter syntax for modular
    /// logging: https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html#example-syntax. Examples: ["info", "warn,tokio=info"]
    #[clap(long, default_value = "info")]
    log_level: String,

    /// Print scheduler version and exit.
    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
    version: bool,

    /// Optional run ID for tracking scheduler instances.
    #[clap(long)]
    run_id: Option<u64>,

    /// Show descriptions for statistics.
    #[clap(long)]
    help_stats: bool,

    #[clap(flatten, next_help_heading = "Libbpf Options")]
    pub libbpf: LibbpfOpts,

    /// Topology configuration options
    #[clap(flatten)]
    topology: Option<TopologyArgs>,
}
272
273impl Opts {
274    fn can_autopilot(&self) -> bool {
275        self.autopower == false
276            && self.performance == false
277            && self.powersave == false
278            && self.balanced == false
279            && self.no_core_compaction == false
280    }
281
282    fn can_autopower(&self) -> bool {
283        self.autopilot == false
284            && self.performance == false
285            && self.powersave == false
286            && self.balanced == false
287            && self.no_core_compaction == false
288    }
289
290    fn can_performance(&self) -> bool {
291        self.autopilot == false
292            && self.autopower == false
293            && self.powersave == false
294            && self.balanced == false
295    }
296
297    fn can_balanced(&self) -> bool {
298        self.autopilot == false
299            && self.autopower == false
300            && self.performance == false
301            && self.powersave == false
302            && self.no_core_compaction == false
303    }
304
305    fn can_powersave(&self) -> bool {
306        self.autopilot == false
307            && self.autopower == false
308            && self.performance == false
309            && self.balanced == false
310            && self.no_core_compaction == false
311    }
312
313    fn proc(&mut self) -> Option<&mut Self> {
314        if !self.autopilot {
315            self.autopilot = self.can_autopilot();
316        }
317
318        if self.autopilot {
319            if !self.can_autopilot() {
320                info!("Autopilot mode cannot be used with conflicting options.");
321                return None;
322            }
323            info!("Autopilot mode is enabled.");
324        }
325
326        if self.autopower {
327            if !self.can_autopower() {
328                info!("Autopower mode cannot be used with conflicting options.");
329                return None;
330            }
331            info!("Autopower mode is enabled.");
332        }
333
334        if self.performance {
335            if !self.can_performance() {
336                info!("Performance mode cannot be used with conflicting options.");
337                return None;
338            }
339            info!("Performance mode is enabled.");
340            self.no_core_compaction = true;
341        }
342
343        if self.powersave {
344            if !self.can_powersave() {
345                info!("Powersave mode cannot be used with conflicting options.");
346                return None;
347            }
348            info!("Powersave mode is enabled.");
349            self.no_core_compaction = false;
350        }
351
352        if self.balanced {
353            if !self.can_balanced() {
354                info!("Balanced mode cannot be used with conflicting options.");
355                return None;
356            }
357            info!("Balanced mode is enabled.");
358            self.no_core_compaction = false;
359        }
360
361        if !EnergyModel::has_energy_model() || !self.cpu_pref_order.is_empty() {
362            self.no_use_em = true;
363            info!("Energy model won't be used for CPU preference order.");
364        }
365
366        if let Some(pinned_slice) = self.pinned_slice_us {
367            if pinned_slice == 0 {
368                info!("Pinned task slice mode is disabled. Pinned tasks will use per-domain DSQs.");
369            } else if pinned_slice < self.slice_min_us || pinned_slice > self.slice_max_us {
370                info!(
371                    "pinned-slice-us ({}) must be between slice-min-us ({}) and slice-max-us ({})",
372                    pinned_slice, self.slice_min_us, self.slice_max_us
373                );
374                return None;
375            } else {
376                info!(
377                "Pinned task slice mode is enabled ({} us). Pinned tasks will use per-CPU DSQs.",
378                pinned_slice
379            );
380            }
381        }
382
383        Some(self)
384    }
385
386    fn preempt_shift_range(s: &str) -> Result<u8, String> {
387        number_range(s, 0, 10)
388    }
389
390    fn mig_delta_pct_range(s: &str) -> Result<u8, String> {
391        number_range(s, 0, 100)
392    }
393
394    fn lb_low_util_pct_range(s: &str) -> Result<u8, String> {
395        number_range(s, 0, 100)
396    }
397
398    fn lb_local_dsq_util_pct_range(s: &str) -> Result<u8, String> {
399        number_range(s, 0, 100)
400    }
401}
402
// SAFETY: `msg_task_ctx` is a bindgen-generated C struct used as the
// ring-buffer record format — assumed to be plain-old-data with no
// invalid bit patterns; TODO confirm against bpf_intf.
unsafe impl Plain for msg_task_ctx {}

impl msg_task_ctx {
    /// Reinterpret a raw ring-buffer payload as a `msg_task_ctx` record.
    ///
    /// Panics when `buf` is shorter than the struct or misaligned.
    fn from_bytes(buf: &[u8]) -> &msg_task_ctx {
        plain::from_bytes(buf).expect("The buffer is either too short or not aligned!")
    }
}
410
411impl introspec {
412    fn new() -> Self {
413        let intrspc = unsafe { mem::MaybeUninit::<introspec>::zeroed().assume_init() };
414        intrspc
415    }
416}
417
/// Userspace side of the LAVD scheduler: owns the loaded BPF skeleton,
/// the attached struct_ops link, and the plumbing for statistics and
/// introspection samples.
struct Scheduler<'a> {
    skel: BpfSkel<'a>,                   // loaded BPF skeleton
    struct_ops: Option<libbpf_rs::Link>, // attached lavd_ops struct_ops link
    rb_mgr: libbpf_rs::RingBuffer<'static>, // ring buffer carrying introspection messages
    intrspc: introspec,                  // current introspection command/argument
    intrspc_rx: Receiver<SchedSample>,   // samples relayed from the ring-buffer callback
    monitor_tid: Option<ThreadId>,       // thread currently registered as sample consumer
    stats_server: StatsServer<StatsReq, StatsRes>, // stats request/response server
    mseq_id: u64,                        // sequence id for system stats responses
}
428
429impl<'a> Scheduler<'a> {
    /// Open, configure, load, and attach the LAVD BPF scheduler, returning
    /// a fully constructed `Scheduler`.
    ///
    /// Ordering matters: topology and option globals must be written into
    /// the skeleton between open and load, and the arena is set up after
    /// load but before attach.
    fn init(opts: &'a Opts, open_object: &'a mut MaybeUninit<OpenObject>) -> Result<Self> {
        // The BPF side sizes its per-CPU arrays with LAVD_CPU_ID_MAX.
        if *NR_CPU_IDS > LAVD_CPU_ID_MAX as usize {
            panic!(
                "Num possible CPU IDs ({}) exceeds maximum of ({})",
                *NR_CPU_IDS, LAVD_CPU_ID_MAX
            );
        }

        try_set_rlimit_infinity();

        // Open the BPF prog first for verification.
        // Map the textual log level onto BPF-side verbosity:
        // "trace" -> 2, "debug" -> 1, otherwise 0.
        let debug_level = if opts.log_level.contains("trace") {
            2
        } else if opts.log_level.contains("debug") {
            1
        } else {
            0
        };
        let mut skel_builder = BpfSkelBuilder::default();
        skel_builder.obj_builder.debug(debug_level > 1);
        init_libbpf_logging(Some(PrintLevel::Debug));

        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
        let mut skel = scx_ops_open!(skel_builder, open_object, lavd_ops, open_opts)?;

        // Enable futex tracing using ftrace if available. If the ftrace is not
        // available, use tracepoint, which is known to be slower than ftrace.
        if !opts.no_futex_boost {
            if Self::attach_futex_ftraces(&mut skel)? == false {
                info!("Fail to attach futex ftraces. Try with tracepoints.");
                if Self::attach_futex_tracepoints(&mut skel)? == false {
                    info!("Fail to attach futex tracepoints.");
                }
            }
        }

        // Initialize CPU topology with CLI arguments
        let order = CpuOrder::new(opts.topology.as_ref()).unwrap();
        Self::init_cpus(&mut skel, &order);
        Self::init_cpdoms(&mut skel, &order);

        // When there are multiple domains, hook the execve() syscall family
        // to enable aggressive cross-domain migration when execve() is called.
        if order.cpdom_map.len() > 1 {
            Self::attach_execve_tracepoints(&mut skel)?;
        }

        // Initialize skel according to @opts.
        Self::init_globals(&mut skel, &opts, &order, debug_level);

        // Initialize arena (requires the loaded object and per-task
        // context size).
        let mut skel = scx_ops_load!(skel, lavd_ops, uei)?;
        let task_size = std::mem::size_of::<types::task_ctx>();
        let arenalib = ArenaLib::init(skel.object_mut(), task_size, *NR_CPU_IDS)?;
        arenalib.setup()?;

        // Attach.
        let struct_ops = Some(scx_ops_attach!(skel, lavd_ops)?);
        let stats_server = StatsServer::new(stats::server_data(*NR_CPU_IDS as u64)).launch()?;

        // Build a ring buffer for instrumentation; the callback relays each
        // BPF-emitted sample into the bounded channel consumed by the
        // stats machinery.
        let (intrspc_tx, intrspc_rx) = channel::bounded(65536);
        let rb_map = &mut skel.maps.introspec_msg;
        let mut builder = libbpf_rs::RingBufferBuilder::new();
        builder
            .add(rb_map, move |data| {
                Scheduler::relay_introspec(data, &intrspc_tx)
            })
            .unwrap();
        let rb_mgr = builder.build().unwrap();

        Ok(Self {
            skel,
            struct_ops,
            rb_mgr,
            intrspc: introspec::new(),
            intrspc_rx,
            monitor_tid: None,
            stats_server,
            mseq_id: 0,
        })
    }
512
513    fn attach_futex_ftraces(skel: &mut OpenBpfSkel) -> Result<bool> {
514        let ftraces = vec![
515            ("__futex_wait", &skel.progs.fexit___futex_wait),
516            ("futex_wait_multiple", &skel.progs.fexit_futex_wait_multiple),
517            (
518                "futex_wait_requeue_pi",
519                &skel.progs.fexit_futex_wait_requeue_pi,
520            ),
521            ("futex_wake", &skel.progs.fexit_futex_wake),
522            ("futex_wake_op", &skel.progs.fexit_futex_wake_op),
523            ("futex_lock_pi", &skel.progs.fexit_futex_lock_pi),
524            ("futex_unlock_pi", &skel.progs.fexit_futex_unlock_pi),
525        ];
526
527        if compat::tracer_available("function")? == false {
528            info!("Ftrace is not enabled in the kernel.");
529            return Ok(false);
530        }
531
532        compat::cond_kprobes_enable(ftraces)
533    }
534
535    fn attach_futex_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
536        let tracepoints = vec![
537            ("syscalls:sys_enter_futex", &skel.progs.rtp_sys_enter_futex),
538            ("syscalls:sys_exit_futex", &skel.progs.rtp_sys_exit_futex),
539            (
540                "syscalls:sys_exit_futex_wait",
541                &skel.progs.rtp_sys_exit_futex_wait,
542            ),
543            (
544                "syscalls:sys_exit_futex_waitv",
545                &skel.progs.rtp_sys_exit_futex_waitv,
546            ),
547            (
548                "syscalls:sys_exit_futex_wake",
549                &skel.progs.rtp_sys_exit_futex_wake,
550            ),
551        ];
552
553        compat::cond_tracepoints_enable(tracepoints)
554    }
555
556    fn attach_execve_tracepoints(skel: &mut OpenBpfSkel) -> Result<bool> {
557        let tracepoints = vec![
558            (
559                "syscalls:sys_enter_execve",
560                &skel.progs.cond_hook_sys_enter_execve,
561            ),
562            (
563                "syscalls:sys_enter_execveat",
564                &skel.progs.cond_hook_sys_enter_execveat,
565            ),
566        ];
567
568        compat::cond_tracepoints_enable(tracepoints)
569    }
570
571    fn init_cpus(skel: &mut OpenBpfSkel, order: &CpuOrder) {
572        debug!("{:#?}", order);
573
574        // Initialize CPU capacity and sibling
575        for cpu in order.cpuids.iter() {
576            skel.maps.rodata_data.as_mut().unwrap().cpu_capacity[cpu.cpu_adx] = cpu.cpu_cap as u16;
577            skel.maps.rodata_data.as_mut().unwrap().cpu_big[cpu.cpu_adx] = cpu.big_core as u8;
578            skel.maps.rodata_data.as_mut().unwrap().cpu_turbo[cpu.cpu_adx] = cpu.turbo_core as u8;
579            skel.maps.rodata_data.as_mut().unwrap().cpu_sibling[cpu.cpu_adx] =
580                cpu.cpu_sibling as u32;
581        }
582
583        // Initialize performance vs. CPU order table.
584        let nr_pco_states: u8 = order.perf_cpu_order.len() as u8;
585        if nr_pco_states > LAVD_PCO_STATE_MAX as u8 {
586            panic!("Generated performance vs. CPU order stats are too complex ({nr_pco_states}) to handle");
587        }
588
589        skel.maps.rodata_data.as_mut().unwrap().nr_pco_states = nr_pco_states;
590        for (i, (_, pco)) in order.perf_cpu_order.iter().enumerate() {
591            Self::init_pco_tuple(skel, i, &pco);
592            info!("{:#}", pco);
593        }
594
595        let (_, last_pco) = order.perf_cpu_order.last_key_value().unwrap();
596        for i in nr_pco_states..LAVD_PCO_STATE_MAX as u8 {
597            Self::init_pco_tuple(skel, i as usize, &last_pco);
598        }
599    }
600
601    fn init_pco_tuple(skel: &mut OpenBpfSkel, i: usize, pco: &PerfCpuOrder) {
602        let cpus_perf = pco.cpus_perf.borrow();
603        let cpus_ovflw = pco.cpus_ovflw.borrow();
604        let pco_nr_primary = cpus_perf.len();
605
606        skel.maps.rodata_data.as_mut().unwrap().pco_bounds[i] = pco.perf_cap as u32;
607        skel.maps.rodata_data.as_mut().unwrap().pco_nr_primary[i] = pco_nr_primary as u16;
608
609        for (j, &cpu_adx) in cpus_perf.iter().enumerate() {
610            skel.maps.rodata_data.as_mut().unwrap().pco_table[i][j] = cpu_adx as u16;
611        }
612
613        for (j, &cpu_adx) in cpus_ovflw.iter().enumerate() {
614            let k = j + pco_nr_primary;
615            skel.maps.rodata_data.as_mut().unwrap().pco_table[i][k] = cpu_adx as u16;
616        }
617    }
618
619    fn init_cpdoms(skel: &mut OpenBpfSkel, order: &CpuOrder) {
620        // Initialize compute domain contexts
621        for (k, v) in order.cpdom_map.iter() {
622            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].id = v.cpdom_id as u64;
623            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].alt_id =
624                v.cpdom_alt_id.get() as u64;
625            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].numa_id = k.numa_adx as u8;
626            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].llc_id = k.llc_adx as u8;
627            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].is_big = k.is_big as u8;
628            skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].is_valid = 1;
629            for cpu_id in v.cpu_ids.iter() {
630                let i = cpu_id / 64;
631                let j = cpu_id % 64;
632                skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].__cpumask[i] |=
633                    0x01 << j;
634            }
635
636            if v.neighbor_map.borrow().iter().len() > LAVD_CPDOM_MAX_DIST as usize {
637                panic!("The processor topology is too complex to handle in BPF.");
638            }
639
640            for (k, (_d, neighbors)) in v.neighbor_map.borrow().iter().enumerate() {
641                let nr_neighbors = neighbors.borrow().len() as u8;
642                if nr_neighbors > LAVD_CPDOM_MAX_NR as u8 {
643                    panic!("The processor topology is too complex to handle in BPF.");
644                }
645                skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].nr_neighbors[k] =
646                    nr_neighbors;
647                for (i, &id) in neighbors.borrow().iter().enumerate() {
648                    let idx = (k * LAVD_CPDOM_MAX_NR as usize) + i;
649                    skel.maps.bss_data.as_mut().unwrap().cpdom_ctxs[v.cpdom_id].neighbor_ids[idx] =
650                        id as u8;
651                }
652            }
653        }
654    }
655
    /// Copy the processed options and topology facts into the skeleton's
    /// global sections and set the struct_ops flags, before load.
    fn init_globals(skel: &mut OpenBpfSkel, opts: &Opts, order: &CpuOrder, debug_level: u8) {
        // Runtime-tunable flags go into bss_data.
        let bss_data = skel.maps.bss_data.as_mut().unwrap();
        bss_data.no_preemption = opts.no_preemption;
        bss_data.no_core_compaction = opts.no_core_compaction;
        bss_data.no_freq_scaling = opts.no_freq_scaling;
        bss_data.is_powersave_mode = opts.powersave;
        // Load-time constants go into rodata_data.
        let rodata = skel.maps.rodata_data.as_mut().unwrap();
        rodata.nr_llcs = order.nr_llcs as u64;
        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
        rodata.is_smt_active = order.smt_enabled;
        rodata.is_autopilot_on = opts.autopilot;
        rodata.verbose = debug_level;
        // Microseconds -> nanoseconds for the BPF side.
        rodata.slice_max_ns = opts.slice_max_us * 1000;
        rodata.slice_min_ns = opts.slice_min_us * 1000;
        rodata.pinned_slice_ns = opts.pinned_slice_us.map(|v| v * 1000).unwrap_or(0);
        rodata.preempt_shift = opts.preempt_shift;
        rodata.mig_delta_pct = opts.mig_delta_pct;
        // Percentage -> 1024-scale fixed point: (pct << 10) / 100.
        rodata.lb_low_util_wall = ((opts.lb_low_util_pct as u64) << 10) / 100;
        rodata.lb_local_dsq_util_wall = ((opts.lb_local_dsq_util_pct as u64) << 10) / 100;
        rodata.no_use_em = opts.no_use_em as u8;
        rodata.no_wake_sync = opts.no_wake_sync;
        rodata.no_slice_boost = opts.no_slice_boost;
        rodata.per_cpu_dsq = opts.per_cpu_dsq;
        rodata.enable_cpu_bw = opts.enable_cpu_bw;

        // Older kernels lack scx_group_set_bandwidth; null out the callback
        // so attachment does not fail.
        if !ksym_exists("scx_group_set_bandwidth").unwrap() {
            skel.struct_ops.lavd_ops_mut().cgroup_set_bandwidth = std::ptr::null_mut();
            warn!("Kernel does not support ops.cgroup_set_bandwidth(), so disable it.");
        }

        skel.struct_ops.lavd_ops_mut().flags = *compat::SCX_OPS_ENQ_EXITING
            | *compat::SCX_OPS_ENQ_LAST
            | *compat::SCX_OPS_ENQ_MIGRATION_DISABLED
            | *compat::SCX_OPS_KEEP_BUILTIN_IDLE;

        if opts.partial {
            skel.struct_ops.lavd_ops_mut().flags |= *compat::SCX_OPS_SWITCH_PARTIAL;
        }
    }
695
696    fn get_msg_seq_id() -> u64 {
697        static mut MSEQ: u64 = 0;
698        unsafe {
699            MSEQ += 1;
700            MSEQ
701        }
702    }
703
    /// Ring-buffer callback: decode one BPF-emitted `msg_task_ctx` record
    /// and forward it as a `SchedSample` over `intrspc_tx`.
    ///
    /// Always returns 0 so ring-buffer consumption continues; when the
    /// channel is full the sample is silently dropped.
    fn relay_introspec(data: &[u8], intrspc_tx: &Sender<SchedSample>) -> i32 {
        let mt = msg_task_ctx::from_bytes(data);
        let tx = mt.taskc_x;

        // No idea how to print other types than LAVD_MSG_TASKC
        if mt.hdr.kind != LAVD_MSG_TASKC {
            return 0;
        }

        let mseq = Scheduler::get_msg_seq_id();

        // SAFETY: `comm`, `waker_comm`, and `stat` are fixed-size c_char
        // arrays filled by the BPF side; assumes each is NUL-terminated
        // within its bounds — TODO confirm in main.bpf.c.
        let c_tx_cm: *const c_char = (&tx.comm as *const [c_char; 17]) as *const c_char;
        let c_tx_cm_str: &CStr = unsafe { CStr::from_ptr(c_tx_cm) };
        let tx_comm: &str = c_tx_cm_str.to_str().unwrap();

        let c_waker_cm: *const c_char = (&tx.waker_comm as *const [c_char; 17]) as *const c_char;
        let c_waker_cm_str: &CStr = unsafe { CStr::from_ptr(c_waker_cm) };
        let waker_comm: &str = c_waker_cm_str.to_str().unwrap();

        let c_tx_st: *const c_char = (&tx.stat as *const [c_char; 5]) as *const c_char;
        let c_tx_st_str: &CStr = unsafe { CStr::from_ptr(c_tx_st) };
        let tx_stat: &str = c_tx_st_str.to_str().unwrap();

        match intrspc_tx.try_send(SchedSample {
            mseq,
            pid: tx.pid,
            comm: tx_comm.into(),
            stat: tx_stat.into(),
            cpu_id: tx.cpu_id,
            prev_cpu_id: tx.prev_cpu_id,
            suggested_cpu_id: tx.suggested_cpu_id,
            waker_pid: tx.waker_pid,
            waker_comm: waker_comm.into(),
            slice_wall: tx.slice_wall,
            lat_cri: tx.lat_cri,
            avg_lat_cri: tx.avg_lat_cri,
            static_prio: tx.static_prio,
            rerunnable_interval_wall: tx.rerunnable_interval_wall,
            resched_interval_wall: tx.resched_interval_wall,
            run_freq: tx.run_freq,
            avg_runtime_wall: tx.avg_runtime_wall,
            wait_freq: tx.wait_freq,
            wake_freq: tx.wake_freq,
            perf_cri: tx.perf_cri,
            thr_perf_cri: tx.thr_perf_cri,
            cpuperf_cur: tx.cpuperf_cur,
            cpu_util_wall: tx.cpu_util_wall,
            cpu_util_invr: tx.cpu_util_invr,
            steal_util_wall: tx.steal_util_wall,
            steal_util_invr: tx.steal_util_invr,
            dom_pinned_util_wall: tx.dom_pinned_util_wall,
            dom_pinned_util_invr: tx.dom_pinned_util_invr,
            nr_active: tx.nr_active,
            dsq_id: tx.dsq_id,
            dsq_consume_lat: tx.dsq_consume_lat,
            slice_used_wall: tx.last_slice_used_wall,
        }) {
            // A full channel means the monitor is slow; drop the sample.
            Ok(()) | Err(TrySendError::Full(_)) => 0,
            Err(e) => panic!("failed to send on intrspc_tx ({})", e),
        }
    }
765
766    fn prep_introspec(&mut self) {
767        if !self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
768            self.skel.maps.bss_data.as_mut().unwrap().is_monitored = true;
769        }
770        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = self.intrspc.cmd;
771        self.skel.maps.bss_data.as_mut().unwrap().intrspc.arg = self.intrspc.arg;
772    }
773
    /// Reset the introspection command on the BPF side to a no-op so that
    /// sample generation stops until the next stats request re-arms it.
    fn cleanup_introspec(&mut self) {
        self.skel.maps.bss_data.as_mut().unwrap().intrspc.cmd = LAVD_CMD_NOP;
    }
777
778    fn get_pc(x: u64, y: u64) -> f64 {
779        return 100. * x as f64 / y as f64;
780    }
781
782    fn get_power_mode(power_mode: i32) -> &'static str {
783        match power_mode as u32 {
784            LAVD_PM_PERFORMANCE => "performance",
785            LAVD_PM_BALANCED => "balanced",
786            LAVD_PM_POWERSAVE => "powersave",
787            _ => "unknown",
788        }
789    }
790
    /// Serve one request from the stats server.
    ///
    /// Three request kinds are handled:
    /// - `NewSampler`: registers the requesting thread as the sole monitor.
    /// - `SysStatsReq`: snapshots system-wide counters from BPF `bss` data.
    /// - `SchedSamplesNr`: asks BPF for N per-task samples and drains them.
    ///
    /// Requests from a thread other than the registered monitor are answered
    /// with `StatsRes::Bye`.
    fn stats_req_to_res(&mut self, req: &StatsReq) -> Result<StatsRes> {
        Ok(match req {
            StatsReq::NewSampler(tid) => {
                // Drain any stale ring-buffer entries before handing
                // sampling over to the new monitoring thread.
                self.rb_mgr.consume().unwrap();
                self.monitor_tid = Some(*tid);
                StatsRes::Ack
            }
            StatsReq::SysStatsReq { tid } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }
                self.mseq_id += 1;

                let bss_data = self.skel.maps.bss_data.as_ref().unwrap();
                let st = bss_data.sys_stat;

                let mseq = self.mseq_id;
                let nr_queued_task = st.nr_queued_task;
                let nr_active = st.nr_active;
                let nr_sched = st.nr_sched;
                let nr_preempt = st.nr_preempt;
                // Percentages below are relative to nr_sched or nr_big.
                let pc_pc = Self::get_pc(st.nr_perf_cri, nr_sched);
                let pc_lc = Self::get_pc(st.nr_lat_cri, nr_sched);
                let pc_x_migration = Self::get_pc(st.nr_x_migration, nr_sched);
                let nr_stealee = st.nr_stealee;
                let nr_big = st.nr_big;
                let pc_big = Self::get_pc(nr_big, nr_sched);
                let pc_pc_on_big = Self::get_pc(st.nr_pc_on_big, nr_big);
                let pc_lc_on_big = Self::get_pc(st.nr_lc_on_big, nr_big);
                let power_mode = Self::get_power_mode(bss_data.power_mode);
                // Share of wall time spent in each power mode so far.
                let total_time = bss_data.performance_mode_ns
                    + bss_data.balanced_mode_ns
                    + bss_data.powersave_mode_ns;
                let pc_performance = Self::get_pc(bss_data.performance_mode_ns, total_time);
                let pc_balanced = Self::get_pc(bss_data.balanced_mode_ns, total_time);
                let pc_powersave = Self::get_pc(bss_data.powersave_mode_ns, total_time);

                StatsRes::SysStats(SysStats {
                    mseq,
                    nr_queued_task,
                    nr_active,
                    nr_sched,
                    nr_preempt,
                    pc_pc,
                    pc_lc,
                    pc_x_migration,
                    nr_stealee,
                    pc_big,
                    pc_pc_on_big,
                    pc_lc_on_big,
                    power_mode: power_mode.to_string(),
                    pc_performance,
                    pc_balanced,
                    pc_powersave,
                })
            }
            StatsReq::SchedSamplesNr {
                tid,
                nr_samples,
                interval_ms,
            } => {
                if Some(*tid) != self.monitor_tid {
                    return Ok(StatsRes::Bye);
                }

                // Order matters here: arm the BPF-side sampling command,
                // give BPF `interval_ms` to produce samples, then poll the
                // ring buffer so the callback forwards them to intrspc_rx.
                self.intrspc.cmd = LAVD_CMD_SCHED_N;
                self.intrspc.arg = *nr_samples;
                self.prep_introspec();
                std::thread::sleep(Duration::from_millis(*interval_ms));
                self.rb_mgr.poll(Duration::from_millis(100)).unwrap();

                // Drain whatever samples arrived; there may be fewer than
                // requested if the interval elapsed first.
                let mut samples = vec![];
                while let Ok(ts) = self.intrspc_rx.try_recv() {
                    samples.push(ts);
                }

                self.cleanup_introspec();

                StatsRes::SchedSamples(SchedSamples { samples })
            }
        })
    }
873
874    fn stop_monitoring(&mut self) {
875        if self.skel.maps.bss_data.as_ref().unwrap().is_monitored {
876            self.skel.maps.bss_data.as_mut().unwrap().is_monitored = false;
877        }
878    }
879
    /// Return true once the BPF scheduler has exited, as reported through
    /// the user-exit-info (UEI) mechanism.
    pub fn exited(&mut self) -> bool {
        uei_exited!(&self.skel, uei)
    }
883
884    fn set_power_profile(&mut self, mode: u32) -> Result<(), u32> {
885        let prog = &mut self.skel.progs.set_power_profile;
886        let mut args = power_arg {
887            power_mode: mode as c_int,
888        };
889        let input = ProgramInput {
890            context_in: Some(unsafe {
891                std::slice::from_raw_parts_mut(
892                    &mut args as *mut _ as *mut u8,
893                    std::mem::size_of_val(&args),
894                )
895            }),
896            ..Default::default()
897        };
898        let out = prog.test_run(input).unwrap();
899        if out.return_value != 0 {
900            return Err(out.return_value);
901        }
902
903        Ok(())
904    }
905
906    fn update_power_profile(&mut self, prev_profile: PowerProfile) -> (bool, PowerProfile) {
907        let profile = fetch_power_profile(false);
908        if profile == prev_profile {
909            // If the profile is the same, skip updaring the profile for BPF.
910            return (true, profile);
911        }
912
913        let _ = match profile {
914            PowerProfile::Performance => self.set_power_profile(LAVD_PM_PERFORMANCE),
915            PowerProfile::Balanced { .. } => self.set_power_profile(LAVD_PM_BALANCED),
916            PowerProfile::Powersave => self.set_power_profile(LAVD_PM_POWERSAVE),
917            PowerProfile::Unknown => {
918                // We don't know how to handle an unknown energy profile,
919                // so we just give up updating the profile from now on.
920                return (false, profile);
921            }
922        };
923
924        info!("Set the scheduler's power profile to {profile} mode.");
925        (true, profile)
926    }
927
    /// Main scheduler loop: apply the initial power profile, then serve
    /// stats requests until shutdown is requested or the BPF side exits.
    ///
    /// Returns the BPF user-exit-info report, which callers use to decide
    /// whether to restart the scheduler.
    fn run(&mut self, opts: &Opts, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut autopower = opts.autopower;
        let mut profile = PowerProfile::Unknown;

        // Pick the initial power profile from CLI flags; balanced is the
        // default when neither --performance nor --powersave is given.
        if opts.performance {
            let _ = self.set_power_profile(LAVD_PM_PERFORMANCE);
        } else if opts.powersave {
            let _ = self.set_power_profile(LAVD_PM_POWERSAVE);
        } else {
            let _ = self.set_power_profile(LAVD_PM_BALANCED);
        }

        while !shutdown.load(Ordering::Relaxed) && !self.exited() {
            // With --autopower, track the system power profile each
            // iteration; update_power_profile may turn autopower off.
            if autopower {
                (autopower, profile) = self.update_power_profile(profile);
            }

            // Wait up to 1s for a stats request; a timeout means no monitor
            // is active, so monitoring is switched off on the BPF side.
            match req_ch.recv_timeout(Duration::from_secs(1)) {
                Ok(req) => {
                    let res = self.stats_req_to_res(&req)?;
                    res_ch.send(res)?;
                }
                Err(RecvTimeoutError::Timeout) => {
                    self.stop_monitoring();
                }
                Err(e) => {
                    // Channel disconnected: stop monitoring, then propagate.
                    self.stop_monitoring();
                    Err(e)?
                }
            }
            self.cleanup_introspec();
        }
        // Drain any remaining ring-buffer entries before tearing down.
        self.rb_mgr.consume().unwrap();

        // Dropping struct_ops detaches the BPF scheduler before reporting.
        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
966}
967
968impl Drop for Scheduler<'_> {
969    fn drop(&mut self) {
970        info!("Unregister {SCHEDULER_NAME} scheduler");
971
972        if let Some(struct_ops) = self.struct_ops.take() {
973            drop(struct_ops);
974        }
975    }
976}
977
978fn init_log(opts: &Opts) {
979    let env_filter = EnvFilter::try_from_default_env()
980        .or_else(|_| match EnvFilter::try_new(&opts.log_level) {
981            Ok(filter) => Ok(filter),
982            Err(e) => {
983                eprintln!(
984                    "invalid log envvar: {}, using info, err is: {}",
985                    opts.log_level, e
986                );
987                EnvFilter::try_new("info")
988            }
989        })
990        .unwrap_or_else(|_| EnvFilter::new("info"));
991
992    match tracing_subscriber::fmt()
993        .with_env_filter(env_filter)
994        .with_target(true)
995        .with_thread_ids(true)
996        .with_file(true)
997        .with_line_number(true)
998        .try_init()
999    {
1000        Ok(()) => {}
1001        Err(e) => eprintln!("failed to init logger: {}", e),
1002    }
1003}
1004
/// Entry point. Options are parsed by the `clap_main` attribute macro.
///
/// Handles the one-shot modes first (--version, --help-stats, sample/stat
/// monitoring), then runs the scheduler, re-initializing it whenever the
/// BPF side requests a restart.
#[clap_main::clap_main]
fn main(mut opts: Opts) -> Result<()> {
    if opts.version {
        println!(
            "scx_lavd {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        // Describe the stats metadata for both stat kinds and exit.
        let sys_stats_meta_name = SysStats::meta().name;
        let sched_sample_meta_name = SchedSample::meta().name;
        let stats_meta_names: &[&str] = &[
            sys_stats_meta_name.as_str(),
            sched_sample_meta_name.as_str(),
        ];
        stats::server_data(0).describe_meta(&mut std::io::stdout(), Some(&stats_meta_names))?;
        return Ok(());
    }

    init_log(&opts);

    if opts.verbose > 0 {
        warn!("Setting verbose via -v is deprecated and will be an error in future releases.");
    }

    if let Some(run_id) = opts.run_id {
        info!("scx_lavd run_id: {}", run_id);
    }

    // Only post-process and dump the options when actually scheduling
    // (i.e., not in a monitor-only invocation).
    if opts.monitor.is_none() && opts.monitor_sched_samples.is_none() {
        opts.proc().unwrap();
        info!("{:#?}", opts);
    }

    // Shared shutdown flag, set from the Ctrl-C handler and polled by the
    // scheduler loop and the monitor threads.
    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    // Sample-monitoring mode: run the monitor to completion and exit
    // without starting a scheduler instance.
    if let Some(nr_samples) = opts.monitor_sched_samples {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor_sched_samples(nr_samples, shutdown_copy).unwrap()
        });
        let _ = jh.join();
        return Ok(());
    }

    // Stats monitoring: with --monitor we only monitor (join and exit);
    // with --stats the monitor thread runs alongside the scheduler below.
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            stats::monitor(Duration::from_secs_f64(intv), shutdown_copy).unwrap()
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    // Scheduler loop: re-initialize and re-run as long as the BPF side
    // reports that a restart is wanted.
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &mut open_object)?;
        info!(
            "scx_lavd scheduler is initialized (build ID: {})",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        info!("scx_lavd scheduler starts running.");
        if !sched.run(&opts, shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}