scx_layered/main.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::ffi::CString;
11use std::fs;
12use std::io::Write;
13use std::mem::MaybeUninit;
14use std::ops::Sub;
15use std::sync::Arc;
16use std::sync::atomic::AtomicBool;
17use std::sync::atomic::Ordering;
18use std::thread::ThreadId;
19use std::time::Duration;
20use std::time::Instant;
21
22use ::fb_procfs as procfs;
23use anyhow::Context;
24use anyhow::Result;
25use anyhow::anyhow;
26use anyhow::bail;
27pub use bpf_skel::*;
28use clap::Parser;
29use crossbeam::channel::RecvTimeoutError;
30use lazy_static::lazy_static;
31use libbpf_rs::MapCore as _;
32use libbpf_rs::OpenObject;
33use libbpf_rs::ProgramInput;
34use log::debug;
35use log::info;
36use log::trace;
37use log::warn;
38use scx_layered::*;
39use scx_stats::prelude::*;
40use scx_utils::CoreType;
41use scx_utils::Cpumask;
42use scx_utils::Llc;
43use scx_utils::NR_CPU_IDS;
44use scx_utils::NR_CPUS_POSSIBLE;
45use scx_utils::NetDev;
46use scx_utils::Topology;
47use scx_utils::UserExitInfo;
48use scx_utils::compat;
49use scx_utils::init_libbpf_logging;
50use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
51use scx_utils::read_netdevs;
52use scx_utils::scx_enums;
53use scx_utils::scx_ops_attach;
54use scx_utils::scx_ops_load;
55use scx_utils::scx_ops_open;
56use scx_utils::uei_exited;
57use scx_utils::uei_report;
58use stats::LayerStats;
59use stats::StatsReq;
60use stats::StatsRes;
61use stats::SysStats;
62
63const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
64const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
65const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
66const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
67const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
68const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
69const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
70const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
71const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
72const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
73
74const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
75const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
76const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
77const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
78const LAYER_USAGE_PROTECTED_PREEMPT: usize =
79    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
80const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
81
82const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
83const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
84const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
85
86const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
87
88lazy_static! {
89    static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
90    static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
91    static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
92    static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
93        specs: vec![
94            LayerSpec {
95                name: "batch".into(),
96                comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
97                matches: vec![
98                    vec![LayerMatch::CgroupPrefix("system.slice/".into())],
99                    vec![LayerMatch::NiceAbove(0)],
100                ],
101                kind: LayerKind::Confined {
102                    util_range: (0.8, 0.9),
103                    cpus_range: Some((0, 16)),
104                    cpus_range_frac: None,
105                    protected: false,
106                    common: LayerCommon {
107                        min_exec_us: 1000,
108                        yield_ignore: 0.0,
109                        preempt: false,
110                        preempt_first: false,
111                        exclusive: false,
112                        allow_node_aligned: false,
113                        skip_remote_node: false,
114                        prev_over_idle_core: false,
115                        idle_smt: None,
116                        slice_us: 20000,
117                        fifo: false,
118                        weight: DEFAULT_LAYER_WEIGHT,
119                        disallow_open_after_us: None,
120                        disallow_preempt_after_us: None,
121                        xllc_mig_min_us: 1000.0,
122                        growth_algo: LayerGrowthAlgo::Sticky,
123                        idle_resume_us: None,
124                        perf: 1024,
125                        nodes: vec![],
126                        llcs: vec![],
127                    },
128                },
129            },
130            LayerSpec {
131                name: "immediate".into(),
132                comment: Some("tasks under workload.slice with nice value < 0".into()),
133                matches: vec![vec![
134                    LayerMatch::CgroupPrefix("workload.slice/".into()),
135                    LayerMatch::NiceBelow(0),
136                ]],
137                kind: LayerKind::Open {
138                    common: LayerCommon {
139                        min_exec_us: 100,
140                        yield_ignore: 0.25,
141                        preempt: true,
142                        preempt_first: false,
143                        exclusive: true,
144                        allow_node_aligned: true,
145                        skip_remote_node: false,
146                        prev_over_idle_core: true,
147                        idle_smt: None,
148                        slice_us: 20000,
149                        fifo: false,
150                        weight: DEFAULT_LAYER_WEIGHT,
151                        disallow_open_after_us: None,
152                        disallow_preempt_after_us: None,
153                        xllc_mig_min_us: 0.0,
154                        growth_algo: LayerGrowthAlgo::Sticky,
155                        perf: 1024,
156                        idle_resume_us: None,
157                        nodes: vec![],
158                        llcs: vec![],
159                    },
160                },
161            },
162            LayerSpec {
163                name: "stress-ng".into(),
164                comment: Some("stress-ng test layer".into()),
165                matches: vec![
166                    vec![LayerMatch::CommPrefix("stress-ng".into()),],
167                    vec![LayerMatch::PcommPrefix("stress-ng".into()),]
168                ],
169                kind: LayerKind::Confined {
170                    cpus_range: None,
171                    util_range: (0.2, 0.8),
172                    protected: false,
173                    cpus_range_frac: None,
174                    common: LayerCommon {
175                        min_exec_us: 800,
176                        yield_ignore: 0.0,
177                        preempt: true,
178                        preempt_first: false,
179                        exclusive: false,
180                        allow_node_aligned: false,
181                        skip_remote_node: false,
182                        prev_over_idle_core: false,
183                        idle_smt: None,
184                        slice_us: 800,
185                        fifo: false,
186                        weight: DEFAULT_LAYER_WEIGHT,
187                        disallow_open_after_us: None,
188                        disallow_preempt_after_us: None,
189                        xllc_mig_min_us: 0.0,
190                        growth_algo: LayerGrowthAlgo::Topo,
191                        perf: 1024,
192                        idle_resume_us: None,
193                        nodes: vec![],
194                        llcs: vec![],
195                    },
196                },
197            },
198            LayerSpec {
199                name: "normal".into(),
200                comment: Some("the rest".into()),
201                matches: vec![vec![]],
202                kind: LayerKind::Grouped {
203                    cpus_range: None,
204                    util_range: (0.5, 0.6),
205                    protected: false,
206                    cpus_range_frac: None,
207                    common: LayerCommon {
208                        min_exec_us: 200,
209                        yield_ignore: 0.0,
210                        preempt: false,
211                        preempt_first: false,
212                        exclusive: false,
213                        allow_node_aligned: false,
214                        skip_remote_node: false,
215                        prev_over_idle_core: false,
216                        idle_smt: None,
217                        slice_us: 20000,
218                        fifo: false,
219                        weight: DEFAULT_LAYER_WEIGHT,
220                        disallow_open_after_us: None,
221                        disallow_preempt_after_us: None,
222                        xllc_mig_min_us: 100.0,
223                        growth_algo: LayerGrowthAlgo::Linear,
224                        perf: 1024,
225                        idle_resume_us: None,
226                        nodes: vec![],
227                        llcs: vec![],
228                    },
229                },
230            },
231        ],
232    };
233}
234
235/// scx_layered: A highly configurable multi-layer sched_ext scheduler
236///
237/// scx_layered allows classifying tasks into multiple layers and applying
238/// different scheduling policies to them. The configuration is specified in
239/// json and composed of two parts - matches and policies.
240///
241/// Matches
242/// =======
243///
244/// Whenever a task is forked or its attributes are changed, the task goes
245/// through a series of matches to determine the layer it belongs to. A
246/// match set is composed of OR groups of AND blocks. An example:
247///
248///   "matches": [
249///     [
250///       {
251///         "CgroupPrefix": "system.slice/"
252///       }
253///     ],
254///     [
255///       {
256///         "CommPrefix": "fbagent"
257///       },
258///       {
259///         "NiceAbove": 0
260///       }
261///     ]
262///   ],
263///
264/// The outer array contains the OR groups and the inner AND blocks, so the
265/// above matches:
266///
267/// - Tasks which are in the cgroup sub-hierarchy under "system.slice".
268///
269/// - Or tasks whose comm starts with "fbagent" and have a nice value > 0.
270///
271/// Currently, the following matches are supported:
272///
273/// - CgroupPrefix: Matches the prefix of the cgroup that the task belongs
274///   to. As this is a string match, whether the pattern has the trailing
275///   '/' makes a difference. For example, "TOP/CHILD/" only matches tasks
276///   which are under that particular cgroup while "TOP/CHILD" also matches
277///   tasks under "TOP/CHILD0/" or "TOP/CHILD1/".
278///
279/// - CommPrefix: Matches the task's comm prefix.
280///
281/// - PcommPrefix: Matches the task's thread group leader's comm prefix.
282///
283/// - NiceAbove: Matches if the task's nice value is greater than the
284///   pattern.
285///
286/// - NiceBelow: Matches if the task's nice value is smaller than the
287///   pattern.
288///
289/// - NiceEquals: Matches if the task's nice value is exactly equal to
290///   the pattern.
291///
/// - UIDEquals: Matches if the task's effective user id matches the value.
///
/// - GIDEquals: Matches if the task's effective group id matches the value.
///
/// - PIDEquals: Matches if the task's pid matches the value.
///
/// - PPIDEquals: Matches if the task's ppid matches the value.
///
/// - TGIDEquals: Matches if the task's tgid matches the value.
///
/// - NSPIDEquals: Matches if the task's namespace id and pid match the values.
///
/// - NSEquals: Matches if the task's namespace id matches the value.
305///
/// - IsGroupLeader: Bool. When true, matches if the task is the group leader
///   (i.e. PID == TGID), aka the thread from which the other threads are
///   created. When false, matches if the task is *not* the group leader
///   (i.e. the rest).
///
/// - CmdJoin: Matches when the task uses pthread_setname_np to send a
///   join/leave command to the scheduler. See examples/cmdjoin.c for more
///   details.
///
/// - UsedGpuTid: Bool. When true, matches tasks which have used GPUs,
///   identified by tid.
///
/// - UsedGpuPid: Bool. When true, matches tasks which have used GPUs,
///   identified by tgid/pid.
318///
319/// While there are complexity limitations as the matches are performed in
320/// BPF, it is straightforward to add more types of matches.
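///
/// For illustration, a hedged sketch of a couple of the other match kinds
/// written in the same JSON form as the example above (assuming single-value
/// matches all follow the same "Kind": value shape):
///
///   "matches": [
///     [
///       {
///         "PcommPrefix": "postgres"
///       },
///       {
///         "NiceEquals": 0
///       }
///     ],
///     [
///       {
///         "IsGroupLeader": true
///       }
///     ]
///   ],
///
/// This would match threads whose group leader's comm starts with "postgres"
/// and whose nice value is exactly 0, or any task which is a group leader.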
321///
322/// Policies
323/// ========
324///
325/// The following is an example policy configuration for a layer.
326///
327///   "kind": {
328///     "Confined": {
329///       "cpus_range": [1, 8],
330///       "util_range": [0.8, 0.9]
331///     }
332///   }
333///
334/// It's of "Confined" kind, which tries to concentrate the layer's tasks
335/// into a limited number of CPUs. In the above case, the number of CPUs
336/// assigned to the layer is scaled between 1 and 8 so that the per-cpu
337/// utilization is kept between 80% and 90%. If the CPUs are loaded higher
338/// than 90%, more CPUs are allocated to the layer. If the utilization drops
339/// below 80%, the layer loses CPUs.
340///
341/// Currently, the following policy kinds are supported:
342///
/// - Confined: Tasks are restricted to the allocated CPUs. The number of
///   CPUs allocated is modulated to keep the per-CPU utilization in
///   "util_range". The number of CPUs can optionally be bounded with the
///   "cpus_range" property.
///
/// - Grouped: Similar to Confined but tasks may spill outside if there are
///   idle CPUs outside the allocated ones. The number of CPUs can optionally
///   be bounded with the "cpus_range" property.
351///
352/// - Open: Prefer the CPUs which are not occupied by Confined or Grouped
353///   layers. Tasks in this group will spill into occupied CPUs if there are
354///   no unoccupied idle CPUs.
355///
356/// All layers take the following options:
357///
358/// - min_exec_us: Minimum execution time in microseconds. Whenever a task
359///   is scheduled in, this is the minimum CPU time that it's charged no
360///   matter how short the actual execution time may be.
361///
362/// - yield_ignore: Yield ignore ratio. If 0.0, yield(2) forfeits a whole
363///   execution slice. 0.25 yields three quarters of an execution slice and
364///   so on. If 1.0, yield is completely ignored.
365///
366/// - slice_us: Scheduling slice duration in microseconds.
367///
368/// - fifo: Use FIFO queues within the layer instead of the default vtime.
369///
370/// - preempt: If true, tasks in the layer will preempt tasks which belong
371///   to other non-preempting layers when no idle CPUs are available.
372///
373/// - preempt_first: If true, tasks in the layer will try to preempt tasks
374///   in their previous CPUs before trying to find idle CPUs.
375///
376/// - exclusive: If true, tasks in the layer will occupy the whole core. The
377///   other logical CPUs sharing the same core will be kept idle. This isn't
378///   a hard guarantee, so don't depend on it for security purposes.
379///
380/// - allow_node_aligned: Put node aligned tasks on layer DSQs instead of lo
381///   fallback. This is a hack to support node-affine tasks without making
382///   the whole scheduler node aware and should only be used with open
383///   layers on non-saturated machines to avoid possible stalls.
384///
/// - prev_over_idle_core: On SMT enabled systems, prefer the task's previous
///   CPU when picking a CPU for tasks in this layer, even if that CPU's SMT
///   sibling is processing a task.
388///
/// - weight: Weight of the layer, in the range 1 to 10000 with a default of
///   100. Layer weights are used during contention to prevent starvation
///   across layers. Weights are combined with utilization to determine the
///   infeasible adjusted weight, with higher weights receiving a larger
///   adjustment in adjusted utilization.
394///
395/// - disallow_open_after_us: Duration to wait after machine reaches saturation
396///   before confining tasks in Open layers.
397///
398/// - cpus_range_frac: Array of 2 floats between 0 and 1.0. Lower and upper
399///   bound fractions of all CPUs to give to a layer. Mutually exclusive
400///   with cpus_range.
401///
/// - disallow_preempt_after_us: Duration to wait after machine reaches
///   saturation before disallowing tasks in the layer from preempting.
///
/// - xllc_mig_min_us: Skip cross-LLC migrations if the task is likely to run
///   on its current LLC sooner than this.
///
/// - idle_smt: ***DEPRECATED***
409///
410/// - growth_algo: When a layer is allocated new CPUs different algorithms can
411///   be used to determine which CPU should be allocated next. The default
412///   algorithm is a "sticky" algorithm that attempts to spread layers evenly
413///   across cores.
414///
/// - perf: CPU performance target. 0 means no configuration. A value
///   between 1 and 1024 indicates the performance level that CPUs running
///   tasks in this layer are configured to via scx_bpf_cpuperf_set().
418///
419/// - idle_resume_us: Sets the idle resume QoS value. CPU idle time governors are expected to
420///   regard the minimum of the global (effective) CPU latency limit and the effective resume
421///   latency constraint for the given CPU as the upper limit for the exit latency of the idle
422///   states. See the latest kernel docs for more details:
423///   https://www.kernel.org/doc/html/latest/admin-guide/pm/cpuidle.html
424///
/// - nodes: If set, the layer will use the given set of NUMA nodes for
///   scheduling decisions. If unset, all available NUMA nodes will be used.
///   If the llcs value is also set, the cpuset of NUMA nodes will be or'ed
///   with the LLC config.
///
/// - llcs: If set, the layer will use the given set of LLCs (last level
///   caches) for scheduling decisions. If unset, all LLCs will be used. If
///   the nodes value is also set, the cpuset of LLCs will be or'ed with the
///   nodes config.
434///
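/// As a rough sketch of how these options combine with a policy kind, a
/// "Grouped" layer might look like the following. This assumes the per-layer
/// options are written alongside the kind-specific fields and that enum
/// values such as growth_algo are given as strings; run `scx_layered -e
/// <file>` to dump the authoritative example format.
///
///   "kind": {
///     "Grouped": {
///       "cpus_range": [2, 16],
///       "util_range": [0.5, 0.7],
///       "preempt": false,
///       "slice_us": 20000,
///       "weight": 100,
///       "growth_algo": "Sticky"
///     }
///   }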
435///
436/// Similar to matches, adding new policies and extending existing ones
437/// should be relatively straightforward.
438///
439/// Configuration example and running scx_layered
440/// =============================================
441///
442/// An scx_layered config is composed of layer configs. A layer config is
443/// composed of a name, a set of matches, and a policy block. Running the
444/// following will write an example configuration into example.json.
445///
446///   $ scx_layered -e example.json
447///
448/// Note that the last layer in the configuration must have an empty match set
449/// as a catch-all for tasks which haven't been matched into previous layers.
450///
451/// The configuration can be specified in multiple json files and
452/// command line arguments, which are concatenated in the specified
453/// order. Each must contain valid layer configurations.
454///
/// By default, an argument to scx_layered is interpreted as a JSON string.
/// If the argument is a path to a JSON file, it should be prefixed with
/// file: or f: as follows:
458///
459///   $ scx_layered file:example.json
460///   ...
461///   $ scx_layered f:example.json
462///
463/// Monitoring Statistics
464/// =====================
465///
466/// Run with `--stats INTERVAL` to enable stats monitoring. There is
467/// also an scx_stat server listening on /var/run/scx/root/stat that can
468/// be monitored by running `scx_layered --monitor INTERVAL` separately.
469///
470///   ```bash
471///   $ scx_layered --monitor 1
472///   tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 proc=6ms
473///   busy= 34.2 util= 1733.6 load=  21744.1 fallback_cpu=  1
474///     batch    : util/frac=   11.8/  0.7 load/frac=     29.7:  0.1 tasks=  2597
475///                tot=   3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00
476///                cpus=  2 [  2,  2] 04000001 00000000
477///     immediate: util/frac= 1218.8/ 70.3 load/frac=  21399.9: 98.4 tasks=  1107
478///                tot=  68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00
479///                cpus= 50 [ 50, 50] fbfffffe 000fffff
480///     normal   : util/frac=  502.9/ 29.0 load/frac=    314.5:  1.4 tasks=  3512
481///                tot=  45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56
482///                cpus= 50 [ 50, 50] fbfffffe 000fffff
483///   ```
484///
485/// Global statistics: see [`SysStats`]
486///
487/// Per-layer statistics: see [`LayerStats`]
488///
489#[derive(Debug, Parser)]
490#[command(verbatim_doc_comment)]
491struct Opts {
492    /// Scheduling slice duration in microseconds.
493    #[clap(short = 's', long, default_value = "20000")]
494    slice_us: u64,
495
496    /// Maximum consecutive execution time in microseconds. A task may be
497    /// allowed to keep executing on a CPU for this long. Note that this is
    /// the upper limit and a task may have to be moved off the CPU earlier. 0
499    /// indicates default - 20 * slice_us.
500    #[clap(short = 'M', long, default_value = "0")]
501    max_exec_us: u64,
502
503    /// Scheduling interval in seconds.
504    #[clap(short = 'i', long, default_value = "0.1")]
505    interval: f64,
506
    /// ***DEPRECATED*** Disable load-fraction based max layer CPU limit.
509    #[clap(short = 'n', long, default_value = "false")]
510    no_load_frac_limit: bool,
511
512    /// Exit debug dump buffer length. 0 indicates default.
513    #[clap(long, default_value = "0")]
514    exit_dump_len: u32,
515
516    /// Enable verbose output, including libbpf details. Specify multiple
517    /// times to increase verbosity.
518    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
519    verbose: u8,
520
521    /// Disable topology awareness. When enabled, the "nodes" and "llcs" settings on
522    /// a layer are ignored. Defaults to false on topologies with multiple NUMA nodes
523    /// or LLCs, and true otherwise.
524    #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
525    disable_topology: Option<bool>,
526
527    /// Enable cross NUMA preemption.
528    #[clap(long)]
529    xnuma_preemption: bool,
530
531    /// Disable monitor
532    #[clap(long)]
533    monitor_disable: bool,
534
535    /// Write example layer specifications into the file and exit.
536    #[clap(short = 'e', long)]
537    example: Option<String>,
538
539    /// ***DEPRECATED*** Disables preemption if the weighted load fraction
540    /// of a layer (load_frac_adj) exceeds the threshold. The default is
541    /// disabled (0.0).
542    #[clap(long, default_value = "0.0")]
543    layer_preempt_weight_disable: f64,
544
545    /// ***DEPRECATED*** Disables layer growth if the weighted load fraction
546    /// of a layer (load_frac_adj) exceeds the threshold. The default is
547    /// disabled (0.0).
548    #[clap(long, default_value = "0.0")]
549    layer_growth_weight_disable: f64,
550
551    /// Enable stats monitoring with the specified interval.
552    #[clap(long)]
553    stats: Option<f64>,
554
555    /// Run in stats monitoring mode with the specified interval. Scheduler
556    /// is not launched.
557    #[clap(long)]
558    monitor: Option<f64>,
559
560    /// Run with example layer specifications (useful for e.g. CI pipelines)
561    #[clap(long)]
562    run_example: bool,
563
    /// ***DEPRECATED*** Enables iteration over local LLCs first for
    /// dispatch.
566    #[clap(long, default_value = "false")]
567    local_llc_iteration: bool,
568
    /// Low priority fallback DSQs are used to execute tasks with custom CPU
    /// affinities. These DSQs are executed immediately only when a CPU is
    /// otherwise idle. However, after the specified wait, they are
    /// guaranteed up to --lo-fb-share fraction of each CPU.
573    #[clap(long, default_value = "10000")]
574    lo_fb_wait_us: u64,
575
576    /// The fraction of CPU time guaranteed to low priority fallback DSQs.
577    /// See --lo-fb-wait-us.
578    #[clap(long, default_value = ".05")]
579    lo_fb_share: f64,
580
581    /// Disable antistall
582    #[clap(long, default_value = "false")]
583    disable_antistall: bool,
584
585    /// Maximum task runnable_at delay (in seconds) before antistall turns on
586    #[clap(long, default_value = "3")]
587    antistall_sec: u64,
588
589    /// Enable gpu support
590    #[clap(long, default_value = "false")]
591    enable_gpu_support: bool,
592
    /// GPU kprobe level. The value set here determines how aggressive the
    /// kprobes enabled on GPU driver functions are. Higher values are more
    /// aggressive, incurring more system overhead while identifying PIDs that
    /// use GPUs more accurately and in a more timely manner. Lower values
    /// incur less system overhead, at the cost of less accurate and slower
    /// identification of GPU pids.
600    #[clap(long, default_value = "3")]
601    gpu_kprobe_level: u64,
602
603    /// Enable netdev IRQ balancing. This is experimental and should be used with caution.
604    #[clap(long, default_value = "false")]
605    netdev_irq_balance: bool,
606
607    /// Disable queued wakeup optimization.
608    #[clap(long, default_value = "false")]
609    disable_queued_wakeup: bool,
610
    /// Per-CPU kthreads preempt by default. Set this to disable that behavior.
612    #[clap(long, default_value = "false")]
613    disable_percpu_kthread_preempt: bool,
614
615    /// Show descriptions for statistics.
616    #[clap(long)]
617    help_stats: bool,
618
619    /// Layer specification. See --help.
620    specs: Vec<String>,
621}
622
623fn read_total_cpu(reader: &procfs::ProcReader) -> Result<procfs::CpuStat> {
624    reader
625        .read_stat()
626        .context("Failed to read procfs")?
627        .total_cpu
628        .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
629}
630
631fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
632    match (curr, prev) {
633        (
634            procfs::CpuStat {
635                user_usec: Some(curr_user),
636                nice_usec: Some(curr_nice),
637                system_usec: Some(curr_system),
638                idle_usec: Some(curr_idle),
639                iowait_usec: Some(curr_iowait),
640                irq_usec: Some(curr_irq),
641                softirq_usec: Some(curr_softirq),
642                stolen_usec: Some(curr_stolen),
643                ..
644            },
645            procfs::CpuStat {
646                user_usec: Some(prev_user),
647                nice_usec: Some(prev_nice),
648                system_usec: Some(prev_system),
649                idle_usec: Some(prev_idle),
650                iowait_usec: Some(prev_iowait),
651                irq_usec: Some(prev_irq),
652                softirq_usec: Some(prev_softirq),
653                stolen_usec: Some(prev_stolen),
654                ..
655            },
656        ) => {
657            let idle_usec = curr_idle.saturating_sub(*prev_idle);
658            let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
659            let user_usec = curr_user.saturating_sub(*prev_user);
660            let system_usec = curr_system.saturating_sub(*prev_system);
661            let nice_usec = curr_nice.saturating_sub(*prev_nice);
662            let irq_usec = curr_irq.saturating_sub(*prev_irq);
663            let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
664            let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
665
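            // Overall utilization is busy time as a fraction of busy + idle +
            // iowait; iowait counts toward the total but not as busy time.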
666            let busy_usec =
667                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
668            let total_usec = idle_usec + busy_usec + iowait_usec;
669            if total_usec > 0 {
670                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
671            } else {
672                Ok(1.0)
673            }
674        }
675        _ => bail!("Missing stats in cpustat"),
676    }
677}
678
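// Copy `src` into the fixed-size i8 buffer `dst` as a NUL-terminated C string.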
679fn copy_into_cstr(dst: &mut [i8], src: &str) {
680    let cstr = CString::new(src).unwrap();
681    let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
682    dst[0..bytes.len()].copy_from_slice(bytes);
683}
684
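// Build a node bitmask from node ids, e.g. nodes [0, 2] -> 0b101.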
685fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
686    let mut mask = 0;
687    for node in nodes {
688        mask |= 1 << node;
689    }
690    mask
691}
692
693fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
694    let mut mask = 0;
695    for (_, cache) in llcs {
696        mask |= 1 << cache.id;
697    }
698    mask
699}
700
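// Read one cpu_ctx per possible CPU from the per-CPU cpu_ctxs BPF map.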
701fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
702    let mut cpu_ctxs = vec![];
703    let cpu_ctxs_vec = skel
704        .maps
705        .cpu_ctxs
706        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
707        .context("Failed to lookup cpu_ctx")?
708        .unwrap();
709    for cpu in 0..*NR_CPUS_POSSIBLE {
710        cpu_ctxs.push(*unsafe {
711            &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
712        });
713    }
714    Ok(cpu_ctxs)
715}
716
717#[derive(Clone, Debug)]
718struct BpfStats {
719    gstats: Vec<u64>,
720    lstats: Vec<Vec<u64>>,
721    lstats_sums: Vec<u64>,
722    llc_lstats: Vec<Vec<Vec<u64>>>, // [layer][llc][stat]
723}
724
725impl BpfStats {
726    fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
727        let nr_layers = skel.maps.rodata_data.nr_layers as usize;
728        let nr_llcs = skel.maps.rodata_data.nr_llcs as usize;
729        let mut gstats = vec![0u64; NR_GSTATS];
730        let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
731        let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
732
733        for cpu in 0..*NR_CPUS_POSSIBLE {
734            for stat in 0..NR_GSTATS {
735                gstats[stat] += cpu_ctxs[cpu].gstats[stat];
736            }
737            for layer in 0..nr_layers {
738                for stat in 0..NR_LSTATS {
739                    lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
740                }
741            }
742        }
743
744        let mut lstats_sums = vec![0u64; NR_LSTATS];
745        for layer in 0..nr_layers {
746            for stat in 0..NR_LSTATS {
747                lstats_sums[stat] += lstats[layer][stat];
748            }
749        }
750
751        for llc_id in 0..nr_llcs {
752            // XXX - This would be a lot easier if llc_ctx were in
753            // the bss. Unfortunately, kernel < v6.12 crashes and
754            // kernel >= v6.12 fails verification after such
755            // conversion due to seemingly verifier bugs. Convert to
756            // bss maps later.
757            let key = llc_id as u32;
758            let llc_id_slice =
759                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
760            let v = skel
761                .maps
762                .llc_data
763                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
764                .unwrap()
765                .unwrap();
766            let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
767
768            for layer_id in 0..nr_layers {
769                for stat_id in 0..NR_LLC_LSTATS {
770                    llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
771                }
772            }
773        }
774
775        Self {
776            gstats,
777            lstats,
778            lstats_sums,
779            llc_lstats,
780        }
781    }
782}
783
784impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
785    type Output = BpfStats;
786
787    fn sub(self, rhs: &'b BpfStats) -> BpfStats {
788        let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
789        BpfStats {
790            gstats: vec_sub(&self.gstats, &rhs.gstats),
791            lstats: self
792                .lstats
793                .iter()
794                .zip(rhs.lstats.iter())
795                .map(|(l, r)| vec_sub(l, r))
796                .collect(),
797            lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
798            llc_lstats: self
799                .llc_lstats
800                .iter()
801                .zip(rhs.llc_lstats.iter())
802                .map(|(l_layer, r_layer)| {
803                    l_layer
804                        .iter()
805                        .zip(r_layer.iter())
806                        .map(|(l_llc, r_llc)| {
807                            let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
808                            // Lat is not subtractable, take L side.
809                            r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
810                            vec_sub(&l_llc, &r_llc)
811                        })
812                        .collect()
813                })
814                .collect(),
815        }
816    }
817}
818
819#[derive(Clone, Debug)]
820struct Stats {
821    at: Instant,
822    elapsed: Duration,
823    nr_layers: usize,
824    nr_layer_tasks: Vec<usize>,
825    nr_nodes: usize,
826
827    total_util: f64, // Running AVG of sum of layer_utils
828    layer_utils: Vec<Vec<f64>>,
829    prev_layer_usages: Vec<Vec<u64>>,
830
831    cpu_busy: f64, // Read from /proc, maybe higher than total_util
832    prev_total_cpu: procfs::CpuStat,
833
834    bpf_stats: BpfStats,
835    prev_bpf_stats: BpfStats,
836
837    processing_dur: Duration,
838    prev_processing_dur: Duration,
839
840    layer_slice_us: Vec<u64>,
841}
842
843impl Stats {
844    fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
845        let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
846
847        for cpu in 0..*NR_CPUS_POSSIBLE {
848            for layer in 0..nr_layers {
849                for usage in 0..NR_LAYER_USAGES {
850                    layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
851                }
852            }
853        }
854
855        layer_usages
856    }
857
858    fn new(skel: &mut BpfSkel, proc_reader: &procfs::ProcReader) -> Result<Self> {
859        let nr_layers = skel.maps.rodata_data.nr_layers as usize;
860        let cpu_ctxs = read_cpu_ctxs(skel)?;
861        let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
862        let nr_nodes = skel.maps.rodata_data.nr_nodes as usize;
863
864        Ok(Self {
865            at: Instant::now(),
866            elapsed: Default::default(),
867            nr_layers,
868            nr_layer_tasks: vec![0; nr_layers],
869            nr_nodes,
870
871            total_util: 0.0,
872            layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
873            prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
874
875            cpu_busy: 0.0,
876            prev_total_cpu: read_total_cpu(proc_reader)?,
877
878            bpf_stats: bpf_stats.clone(),
879            prev_bpf_stats: bpf_stats,
880
881            processing_dur: Default::default(),
882            prev_processing_dur: Default::default(),
883
884            layer_slice_us: vec![0; nr_layers],
885        })
886    }
887
888    fn refresh(
889        &mut self,
890        skel: &mut BpfSkel,
891        proc_reader: &procfs::ProcReader,
892        now: Instant,
893        cur_processing_dur: Duration,
894    ) -> Result<()> {
895        let elapsed = now.duration_since(self.at);
896        let elapsed_f64 = elapsed.as_secs_f64();
897        let cpu_ctxs = read_cpu_ctxs(skel)?;
898
899        let nr_layer_tasks: Vec<usize> = skel
900            .maps
901            .bss_data
902            .layers
903            .iter()
904            .take(self.nr_layers)
905            .map(|layer| layer.nr_tasks as usize)
906            .collect();
907        let layer_slice_us: Vec<u64> = skel
908            .maps
909            .bss_data
910            .layers
911            .iter()
912            .take(self.nr_layers)
913            .map(|layer| layer.slice_ns / 1000_u64)
914            .collect();
915
916        let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
917        let cur_layer_utils: Vec<Vec<f64>> = cur_layer_usages
918            .iter()
919            .zip(self.prev_layer_usages.iter())
920            .map(|(cur, prev)| {
921                cur.iter()
922                    .zip(prev.iter())
923                    .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
924                    .collect()
925            })
926            .collect();
927        let layer_utils: Vec<Vec<f64>> = cur_layer_utils
928            .iter()
929            .zip(self.layer_utils.iter())
930            .map(|(cur, prev)| {
931                cur.iter()
932                    .zip(prev.iter())
933                    .map(|(c, p)| {
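                        // Exponentially decay the previous average: decay =
                        // 0.5^(elapsed / USAGE_HALF_LIFE_F64), so the old
                        // value's weight halves every usage half-life.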
934                        let decay = USAGE_DECAY.powf(elapsed_f64);
935                        p * decay + c * (1.0 - decay)
936                    })
937                    .collect()
938            })
939            .collect();
940
941        let cur_total_cpu = read_total_cpu(proc_reader)?;
942        let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
943
944        let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
945        let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
946
947        let processing_dur = cur_processing_dur
948            .checked_sub(self.prev_processing_dur)
949            .unwrap();
950
951        *self = Self {
952            at: now,
953            elapsed,
954            nr_layers: self.nr_layers,
955            nr_layer_tasks,
956            nr_nodes: self.nr_nodes,
957
958            total_util: layer_utils
959                .iter()
960                .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
961                .sum(),
962            layer_utils,
963            prev_layer_usages: cur_layer_usages,
964
965            cpu_busy,
966            prev_total_cpu: cur_total_cpu,
967
968            bpf_stats,
969            prev_bpf_stats: cur_bpf_stats,
970
971            processing_dur,
972            prev_processing_dur: cur_processing_dur,
973
974            layer_slice_us,
975        };
976        Ok(())
977    }
978}
979
980#[derive(Debug)]
981struct Layer {
982    name: String,
983    kind: LayerKind,
984    growth_algo: LayerGrowthAlgo,
985    core_order: Vec<usize>,
986
987    target_llc_cpus: (usize, usize),
988    assigned_llcs: Vec<usize>,
989
990    nr_cpus: usize,
991    nr_llc_cpus: Vec<usize>,
992    cpus: Cpumask,
993    allowed_cpus: Cpumask,
994}
995
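// Resolve a layer's (min, max) CPU count. For example, with max_cpus = 16 and
// cpus_range_frac = (0.25, 0.5) this yields (4, 8); with neither range set it
// yields (0, max_cpus).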
996fn resolve_cpus_pct_range(
997    cpus_range: &Option<(usize, usize)>,
998    cpus_range_frac: &Option<(f64, f64)>,
999    max_cpus: usize,
1000) -> Result<(usize, usize)> {
1001    match (cpus_range, cpus_range_frac) {
1002        (Some(_x), Some(_y)) => {
            bail!("cpus_range cannot be used with cpus_range_frac.");
1004        }
1005        (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1006        (None, Some((cpus_frac_min, cpus_frac_max))) => {
1007            if *cpus_frac_min < 0_f64
1008                || *cpus_frac_min > 1_f64
1009                || *cpus_frac_max < 0_f64
1010                || *cpus_frac_max > 1_f64
1011            {
1012                bail!("cpus_range_frac values must be between 0.0 and 1.0");
1013            }
1014            let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1015            let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1016            Ok((
1017                std::cmp::max(cpus_min_count, 1),
1018                std::cmp::min(cpus_max_count, max_cpus),
1019            ))
1020        }
1021        (None, None) => Ok((0, max_cpus)),
1022    }
1023}
1024
1025impl Layer {
1026    fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1027        let name = &spec.name;
1028        let kind = spec.kind.clone();
1029        let mut allowed_cpus = Cpumask::new();
1030        match &kind {
1031            LayerKind::Confined {
1032                cpus_range,
1033                cpus_range_frac,
1034                util_range,
1035                common: LayerCommon { nodes, llcs, .. },
1036                ..
1037            } => {
1038                let cpus_range =
1039                    resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1040                if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1041                    bail!("invalid cpus_range {:?}", cpus_range);
1042                }
1043                if nodes.is_empty() && llcs.is_empty() {
1044                    allowed_cpus.set_all();
1045                } else {
1046                    // build up the cpus bitset
1047                    for (node_id, node) in &topo.nodes {
1048                        // first do the matching for nodes
1049                        if nodes.contains(node_id) {
1050                            for &id in node.all_cpus.keys() {
1051                                allowed_cpus.set_cpu(id)?;
1052                            }
1053                        }
1054                        // next match on any LLCs
1055                        for (llc_id, llc) in &node.llcs {
1056                            if llcs.contains(llc_id) {
1057                                for &id in llc.all_cpus.keys() {
1058                                    allowed_cpus.set_cpu(id)?;
1059                                }
1060                            }
1061                        }
1062                    }
1063                }
1064
1065                if util_range.0 < 0.0
1066                    || util_range.0 > 1.0
1067                    || util_range.1 < 0.0
1068                    || util_range.1 > 1.0
1069                    || util_range.0 >= util_range.1
1070                {
1071                    bail!("invalid util_range {:?}", util_range);
1072                }
1073            }
1074            LayerKind::Grouped {
1075                common: LayerCommon { nodes, llcs, .. },
1076                ..
1077            }
1078            | LayerKind::Open {
1079                common: LayerCommon { nodes, llcs, .. },
1080                ..
1081            } => {
1082                if nodes.is_empty() && llcs.is_empty() {
1083                    allowed_cpus.set_all();
1084                } else {
1085                    // build up the cpus bitset
1086                    for (node_id, node) in &topo.nodes {
1087                        // first do the matching for nodes
1088                        if nodes.contains(node_id) {
1089                            for &id in node.all_cpus.keys() {
1090                                allowed_cpus.set_cpu(id)?;
1091                            }
1092                        }
1093                        // next match on any LLCs
1094                        for (llc_id, llc) in &node.llcs {
1095                            if llcs.contains(llc_id) {
1096                                for &id in llc.all_cpus.keys() {
1097                                    allowed_cpus.set_cpu(id)?;
1098                                }
1099                            }
1100                        }
1101                    }
1102                }
1103            }
1104        }
1105
1106        let layer_growth_algo = kind.common().growth_algo.clone();
1107
1108        debug!(
1109            "layer: {} algo: {:?} core order: {:?}",
1110            name, &layer_growth_algo, core_order
1111        );
1112
1113        Ok(Self {
1114            name: name.into(),
1115            kind,
1116            growth_algo: layer_growth_algo,
1117            core_order: core_order.clone(),
1118
1119            target_llc_cpus: (0, 0),
1120            assigned_llcs: vec![],
1121
1122            nr_cpus: 0,
1123            nr_llc_cpus: vec![0; topo.all_llcs.len()],
1124            cpus: Cpumask::new(),
1125            allowed_cpus,
1126        })
1127    }
1128
1129    fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1130        let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1131            Some(ret) => ret.clone(),
1132            None => return Ok(0),
1133        };
1134
1135        let nr_to_free = cpus_to_free.weight();
1136
1137        Ok(if nr_to_free <= max_to_free {
1138            trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1139            self.cpus &= &cpus_to_free.not();
1140            self.nr_cpus -= nr_to_free;
1141            for cpu in cpus_to_free.iter() {
1142                self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1143            }
1144            cpu_pool.free(&cpus_to_free)?;
1145            nr_to_free
1146        } else {
1147            0
1148        })
1149    }
1150
1151    fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1152        let new_cpus = match cpu_pool
1153            .alloc_cpus(&self.allowed_cpus, &self.core_order)
1154            .clone()
1155        {
1156            Some(ret) => ret.clone(),
1157            None => {
1158                trace!("layer-{} can't grow, no CPUs", &self.name);
1159                return Ok(0);
1160            }
1161        };
1162
1163        let nr_new_cpus = new_cpus.weight();
1164
1165        trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1166        self.cpus |= &new_cpus;
1167        self.nr_cpus += nr_new_cpus;
1168        for cpu in new_cpus.iter() {
1169            self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1170        }
1171        Ok(nr_new_cpus)
1172    }
1173}
1174
1175struct Scheduler<'a> {
1176    skel: BpfSkel<'a>,
1177    struct_ops: Option<libbpf_rs::Link>,
1178    layer_specs: Vec<LayerSpec>,
1179
1180    sched_intv: Duration,
1181
1182    cpu_pool: CpuPool,
1183    layers: Vec<Layer>,
1184    idle_qos_enabled: bool,
1185
1186    proc_reader: procfs::ProcReader,
1187    sched_stats: Stats,
1188
1189    nr_layer_cpus_ranges: Vec<(usize, usize)>,
1190    processing_dur: Duration,
1191
1192    topo: Arc<Topology>,
1193    netdevs: BTreeMap<String, NetDev>,
1194    stats_server: StatsServer<StatsReq, StatsRes>,
1195}
1196
1197impl<'a> Scheduler<'a> {
1198    fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1199        skel.maps.rodata_data.nr_layers = specs.len() as u32;
1200        let mut perf_set = false;
1201
1202        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1203        let mut layer_weights: Vec<usize> = vec![];
1204
1205        for (spec_i, spec) in specs.iter().enumerate() {
1206            let layer = &mut skel.maps.bss_data.layers[spec_i];
1207
1208            for (or_i, or) in spec.matches.iter().enumerate() {
1209                for (and_i, and) in or.iter().enumerate() {
1210                    let mt = &mut layer.matches[or_i].matches[and_i];
1211
1212                    // Rules are allowlist-based by default
1213                    mt.exclude.write(false);
1214
1215                    match and {
1216                        LayerMatch::CgroupPrefix(prefix) => {
1217                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1218                            copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1219                        }
1220                        LayerMatch::CommPrefix(prefix) => {
1221                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1222                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1223                        }
1224                        LayerMatch::CommPrefixExclude(prefix) => {
1225                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1226                            mt.exclude.write(true);
1227                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1228                        }
1229                        LayerMatch::PcommPrefix(prefix) => {
1230                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1231                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1232                        }
1233                        LayerMatch::PcommPrefixExclude(prefix) => {
1234                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1235                            mt.exclude.write(true);
1236                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1237                        }
1238                        LayerMatch::NiceAbove(nice) => {
1239                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1240                            mt.nice = *nice;
1241                        }
1242                        LayerMatch::NiceBelow(nice) => {
1243                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1244                            mt.nice = *nice;
1245                        }
1246                        LayerMatch::NiceEquals(nice) => {
1247                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1248                            mt.nice = *nice;
1249                        }
1250                        LayerMatch::UIDEquals(user_id) => {
1251                            mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1252                            mt.user_id = *user_id;
1253                        }
1254                        LayerMatch::GIDEquals(group_id) => {
1255                            mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1256                            mt.group_id = *group_id;
1257                        }
1258                        LayerMatch::PIDEquals(pid) => {
1259                            mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1260                            mt.pid = *pid;
1261                        }
1262                        LayerMatch::PPIDEquals(ppid) => {
1263                            mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1264                            mt.ppid = *ppid;
1265                        }
1266                        LayerMatch::TGIDEquals(tgid) => {
1267                            mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1268                            mt.tgid = *tgid;
1269                        }
1270                        LayerMatch::NSPIDEquals(nsid, pid) => {
1271                            mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1272                            mt.nsid = *nsid;
1273                            mt.pid = *pid;
1274                        }
1275                        LayerMatch::NSEquals(nsid) => {
1276                            mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1277                            mt.nsid = *nsid as u64;
1278                        }
1279                        LayerMatch::CmdJoin(joincmd) => {
1280                            mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1281                            copy_into_cstr(&mut mt.comm_prefix, joincmd);
1282                        }
1283                        LayerMatch::IsGroupLeader(polarity) => {
1284                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1285                            mt.is_group_leader.write(*polarity);
1286                        }
1287                        LayerMatch::IsKthread(polarity) => {
1288                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1289                            mt.is_kthread.write(*polarity);
1290                        }
1291                        LayerMatch::UsedGpuTid(polarity) => {
1292                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1293                            mt.used_gpu_tid.write(*polarity);
1294                        }
1295                        LayerMatch::UsedGpuPid(polarity) => {
1296                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1297                            mt.used_gpu_pid.write(*polarity);
1298                        }
1299                    }
1300                }
1301                layer.matches[or_i].nr_match_ands = or.len() as i32;
1302            }
1303
1304            layer.nr_match_ors = spec.matches.len() as u32;
1305            layer.kind = spec.kind.as_bpf_enum();
1306
1307            {
1308                let LayerCommon {
1309                    min_exec_us,
1310                    yield_ignore,
1311                    perf,
1312                    preempt,
1313                    preempt_first,
1314                    exclusive,
1315                    allow_node_aligned,
1316                    skip_remote_node,
1317                    prev_over_idle_core,
1318                    growth_algo,
1319                    nodes,
1320                    slice_us,
1321                    fifo,
1322                    weight,
1323                    disallow_open_after_us,
1324                    disallow_preempt_after_us,
1325                    xllc_mig_min_us,
1326                    ..
1327                } = spec.kind.common();
1328
1329                layer.slice_ns = *slice_us * 1000;
1330                layer.fifo.write(*fifo);
1331                layer.min_exec_ns = min_exec_us * 1000;
1332                layer.yield_step_ns = if *yield_ignore > 0.999 {
1333                    0
1334                } else if *yield_ignore < 0.001 {
1335                    layer.slice_ns
1336                } else {
1337                    (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1338                };
1339                let mut layer_name: String = spec.name.clone();
1340                layer_name.truncate(MAX_LAYER_NAME);
1341                copy_into_cstr(&mut layer.name, layer_name.as_str());
1342                layer.preempt.write(*preempt);
1343                layer.preempt_first.write(*preempt_first);
1344                layer.exclusive.write(*exclusive);
1345                layer.allow_node_aligned.write(*allow_node_aligned);
1346                layer.skip_remote_node.write(*skip_remote_node);
1347                layer.prev_over_idle_core.write(*prev_over_idle_core);
1348                layer.growth_algo = growth_algo.as_bpf_enum();
1349                layer.weight = *weight;
1350                layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1351                    v if v == u64::MAX => v,
1352                    v => v * 1000,
1353                };
1354                layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1355                    v if v == u64::MAX => v,
1356                    v => v * 1000,
1357                };
1358                layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1359                layer_weights.push(layer.weight.try_into().unwrap());
1360                layer.perf = u32::try_from(*perf)?;
1361                layer.node_mask = nodemask_from_nodes(nodes) as u64;
1362                for (topo_node_id, topo_node) in &topo.nodes {
1363                    if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1364                        continue;
1365                    }
1366                    layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1367                }
1368            }
1369
1370            layer.is_protected.write(match spec.kind {
1371                LayerKind::Open { .. } => false,
1372                LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1373                    protected
1374                }
1375            });
1376
1377            perf_set |= layer.perf > 0;
1378        }
1379
1380        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1381        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1382            skel.maps.rodata_data.layer_iteration_order[idx] = *layer_idx as u32;
1383        }
1384
1385        if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1386            warn!("cpufreq support not available, ignoring perf configurations");
1387        }
1388
1389        Ok(())
1390    }
1391
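    /// Populate the read-only NUMA topology tables: per-node cpumasks, the
    /// LLC-to-node mapping and the CPU-to-LLC mapping.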
1392    fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1393        skel.maps.rodata_data.nr_nodes = topo.nodes.len() as u32;
1394        skel.maps.rodata_data.nr_llcs = 0;
1395
1396        for (&node_id, node) in &topo.nodes {
1397            debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1398            skel.maps.rodata_data.nr_llcs += node.llcs.len() as u32;
1399            let raw_numa_slice = node.span.as_raw_slice();
1400            let node_cpumask_slice = &mut skel.maps.rodata_data.numa_cpumasks[node_id];
1401            let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1402            left.clone_from_slice(raw_numa_slice);
1403            debug!(
1404                "node {} mask: {:?}",
1405                node_id, skel.maps.rodata_data.numa_cpumasks[node_id]
1406            );
1407
1408            for llc in node.llcs.values() {
1409                debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1410                skel.maps.rodata_data.llc_numa_id_map[llc.id] = node_id as u32;
1411            }
1412        }
1413
1414        for cpu in topo.all_cpus.values() {
1415            skel.maps.rodata_data.cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1416        }
1417    }
1418
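    /// Build each CPU's proximity map: all CPUs ordered from topologically
    /// closest to farthest (same core, same LLC, same node, rest of the
    /// system), with the boundary index of each level recorded so lookups can
    /// be limited to a given scope.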
1419    fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1420        let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1421            vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1422            vec
1423        };
1424        let radiate_cpu =
1425            |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1426                vec.sort_by_key(|&id| {
1427                    (
1428                        (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1429                        (center_cpu as i32 - id as i32).abs(),
1430                    )
1431                });
1432                vec
1433            };
1434
1435        for (&cpu_id, cpu) in &topo.all_cpus {
1436            // Collect the spans.
1437            let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1438            let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1439            let node_span = &topo.nodes[&cpu.node_id].span;
1440            let sys_span = &topo.span;
1441
1442            // Make the spans exclusive.
1443            let sys_span = sys_span.and(&node_span.not());
1444            let node_span = node_span.and(&llc_span.not());
1445            let llc_span = llc_span.and(&core_span.not());
1446            core_span.clear_cpu(cpu_id).unwrap();
1447
1448            // Convert them into arrays.
1449            let mut sys_order: Vec<usize> = sys_span.iter().collect();
1450            let mut node_order: Vec<usize> = node_span.iter().collect();
1451            let mut llc_order: Vec<usize> = llc_span.iter().collect();
1452            let mut core_order: Vec<usize> = core_span.iter().collect();
1453
1454            // Reorder them so that different CPUs follow different orders.
1455            // Each CPU radiates in both directions based on the cpu id and
1456            // radiates out to the closest cores based on core ids.
1457
1458            sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1459            node_order = radiate(node_order, cpu.node_id);
1460            llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1461            core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1462
1463            // Concatenate and record the topology boundaries.
1464            let mut order: Vec<usize> = vec![];
1465            let mut idx: usize = 0;
1466
1467            idx += 1;
1468            order.push(cpu_id);
1469
1470            idx += core_order.len();
1471            order.append(&mut core_order);
1472            let core_end = idx;
1473
1474            idx += llc_order.len();
1475            order.append(&mut llc_order);
1476            let llc_end = idx;
1477
1478            idx += node_order.len();
1479            order.append(&mut node_order);
1480            let node_end = idx;
1481
1482            idx += sys_order.len();
1483            order.append(&mut sys_order);
1484            let sys_end = idx;
1485
1486            debug!(
1487                "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1488                cpu_id, core_end, llc_end, node_end, sys_end, &order
1489            );
1490
1491            // Record in cpu_ctx.
1492            let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1493            for (i, &cpu) in order.iter().enumerate() {
1494                pmap.cpus[i] = cpu as u16;
1495            }
1496            pmap.core_end = core_end as u32;
1497            pmap.llc_end = llc_end as u32;
1498            pmap.node_end = node_end as u32;
1499            pmap.sys_end = sys_end as u32;
1500        }
1501    }
1502
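    /// Reinterpret each cpu_ctx struct as raw bytes so the contexts can be
    /// written back through the per-CPU map update.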
1503    fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1504        cpu_ctxs
1505            .into_iter()
1506            .map(|cpu_ctx| {
1507                let bytes = unsafe {
1508                    std::slice::from_raw_parts(
1509                        &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1510                        std::mem::size_of::<bpf_intf::cpu_ctx>(),
1511                    )
1512                };
1513                bytes.to_vec()
1514            })
1515            .collect()
1516    }
1517
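    /// Initialize the per-CPU contexts: mark big cores, give each CPU its own
    /// randomized iteration order over the open/grouped, preempt/non-preempt
    /// layer groups, record the CPU proximity maps and write everything back
    /// to the cpu_ctxs map.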
1518    fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1519        let key = (0_u32).to_ne_bytes();
1520        let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1521        let cpu_ctxs_vec = skel
1522            .maps
1523            .cpu_ctxs
1524            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1525            .context("Failed to lookup cpu_ctx")?
1526            .unwrap();
1527
1528        let op_layers: Vec<u32> = layer_specs
1529            .iter()
1530            .enumerate()
1531            .filter(|(_idx, spec)| match &spec.kind {
1532                LayerKind::Open { .. } => spec.kind.common().preempt,
1533                _ => false,
1534            })
1535            .map(|(idx, _)| idx as u32)
1536            .collect();
1537        let on_layers: Vec<u32> = layer_specs
1538            .iter()
1539            .enumerate()
1540            .filter(|(_idx, spec)| match &spec.kind {
1541                LayerKind::Open { .. } => !spec.kind.common().preempt,
1542                _ => false,
1543            })
1544            .map(|(idx, _)| idx as u32)
1545            .collect();
1546        let gp_layers: Vec<u32> = layer_specs
1547            .iter()
1548            .enumerate()
1549            .filter(|(_idx, spec)| match &spec.kind {
1550                LayerKind::Grouped { .. } => spec.kind.common().preempt,
1551                _ => false,
1552            })
1553            .map(|(idx, _)| idx as u32)
1554            .collect();
1555        let gn_layers: Vec<u32> = layer_specs
1556            .iter()
1557            .enumerate()
1558            .filter(|(_idx, spec)| match &spec.kind {
1559                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1560                _ => false,
1561            })
1562            .map(|(idx, _)| idx as u32)
1563            .collect();
1564
1565        // FIXME - this incorrectly assumes all possible CPUs are consecutive.
1566        for cpu in 0..*NR_CPUS_POSSIBLE {
1567            cpu_ctxs.push(*unsafe {
1568                &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
1569            });
1570
1571            let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
1572            let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
1573            cpu_ctxs[cpu].cpu = cpu as i32;
1574            cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
1575            cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
1576            cpu_ctxs[cpu].is_big = is_big;
1577
1578            fastrand::seed(cpu as u64);
1579
1580            let mut ogp_order = op_layers.clone();
1581            ogp_order.append(&mut gp_layers.clone());
1582            fastrand::shuffle(&mut ogp_order);
1583
1584            let mut ogn_order = on_layers.clone();
1585            ogn_order.append(&mut gn_layers.clone());
1586            fastrand::shuffle(&mut ogn_order);
1587
1588            let mut op_order = op_layers.clone();
1589            fastrand::shuffle(&mut op_order);
1590
1591            let mut on_order = on_layers.clone();
1592            fastrand::shuffle(&mut on_order);
1593
1594            let mut gp_order = gp_layers.clone();
1595            fastrand::shuffle(&mut gp_order);
1596
1597            let mut gn_order = gn_layers.clone();
1598            fastrand::shuffle(&mut gn_order);
1599
1600            for i in 0..MAX_LAYERS {
1601                cpu_ctxs[cpu].ogp_layer_order[i] =
1602                    ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1603                cpu_ctxs[cpu].ogn_layer_order[i] =
1604                    ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1605
1606                cpu_ctxs[cpu].op_layer_order[i] =
1607                    op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1608                cpu_ctxs[cpu].on_layer_order[i] =
1609                    on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1610                cpu_ctxs[cpu].gp_layer_order[i] =
1611                    gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1612                cpu_ctxs[cpu].gn_layer_order[i] =
1613                    gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1614            }
1615        }
1616
1617        Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
1618
1619        skel.maps
1620            .cpu_ctxs
1621            .update_percpu(
1622                &key,
1623                &Self::convert_cpu_ctxs(cpu_ctxs),
1624                libbpf_rs::MapFlags::ANY,
1625            )
1626            .context("Failed to update cpu_ctx")?;
1627
1628        Ok(())
1629    }
1630
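    /// Build each LLC's proximity map (the LLC itself first, then the rest of
    /// its node, then the rest of the system) and store it in the
    /// corresponding llc_ctx entry of the llc_data map.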
1631    fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
1632        for (&llc_id, llc) in &topo.all_llcs {
1633            // Collect the orders.
1634            let mut node_order: Vec<usize> =
1635                topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
1636            let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
1637
1638            // Make the orders exclusive.
1639            sys_order.retain(|id| !node_order.contains(id));
1640            node_order.retain(|&id| id != llc_id);
1641
1642            // Shuffle so that different LLCs follow different orders. See
1643            // init_cpu_prox_map().
1644            fastrand::seed(llc_id as u64);
1645            fastrand::shuffle(&mut sys_order);
1646            fastrand::shuffle(&mut node_order);
1647
1648            // Concatenate and record the node boundary.
1649            let mut order: Vec<usize> = vec![];
1650            let mut idx: usize = 0;
1651
1652            idx += 1;
1653            order.push(llc_id);
1654
1655            idx += node_order.len();
1656            order.append(&mut node_order);
1657            let node_end = idx;
1658
1659            idx += sys_order.len();
1660            order.append(&mut sys_order);
1661            let sys_end = idx;
1662
1663            debug!(
1664                "LLC[{}] proximity map[{}/{}]: {:?}",
1665                llc_id, node_end, sys_end, &order
1666            );
1667
1668            // Record in llc_ctx.
1669            //
1670            // XXX - This would be a lot easier if llc_ctx were in the bss.
1671            // See BpfStats::read().
1672            let key = llc_id as u32;
1673            let llc_id_slice =
1674                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
1675            let v = skel
1676                .maps
1677                .llc_data
1678                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
1679                .unwrap()
1680                .unwrap();
1681            let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
1682
1683            let pmap = &mut llcc.prox_map;
1684            for (i, &llc_id) in order.iter().enumerate() {
1685                pmap.llcs[i] = llc_id as u16;
1686            }
1687            pmap.node_end = node_end as u32;
1688            pmap.sys_end = sys_end as u32;
1689
1690            let v = unsafe {
1691                std::slice::from_raw_parts(
1692                    &llcc as *const bpf_intf::llc_ctx as *const u8,
1693                    std::mem::size_of::<bpf_intf::llc_ctx>(),
1694                )
1695            };
1696
1697            skel.maps
1698                .llc_data
1699                .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
1700        }
1701
1702        Ok(())
1703    }
1704
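    /// Open, configure, load and attach the BPF scheduler, then construct the
    /// userspace scheduler state that drives it.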
1705    fn init(
1706        opts: &'a Opts,
1707        layer_specs: &[LayerSpec],
1708        open_object: &'a mut MaybeUninit<OpenObject>,
1709    ) -> Result<Self> {
1710        let nr_layers = layer_specs.len();
1711        let mut disable_topology = opts.disable_topology.unwrap_or(false);
1712
1713        let topo = Arc::new(if disable_topology {
1714            Topology::with_flattened_llc_node()?
1715        } else {
1716            Topology::new()?
1717        });
1718
1719        /*
1720         * FIXME: scx_layered incorrectly assumes that node, LLC and CPU IDs
1721         * are consecutive. Verify that they are on this system and bail if
1722         * not. It's lucky that core ID is not used anywhere as core IDs are
1723         * not consecutive on some Ryzen CPUs.
1724         */
1725        if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
1726            bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
1727        }
1728        if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
1729            bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
1730        }
1731        if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
1732            bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
1733        }
1734
1735        let netdevs = if opts.netdev_irq_balance {
1736            warn!(
1737                "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
1738            );
1739            read_netdevs()?
1740        } else {
1741            BTreeMap::new()
1742        };
1743
1744        if !disable_topology {
1745            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
1746                disable_topology = true;
1747            };
1748            info!(
1749                "Topology awareness not specified, selecting {} based on hardware",
1750                if disable_topology {
1751                    "disabled"
1752                } else {
1753                    "enabled"
1754                }
1755            );
1756        };
1757
1758        let cpu_pool = CpuPool::new(topo.clone())?;
1759
1760        // If topology awareness is disabled, clear out any configured NUMA/LLC
1761        // restrictions so the layers fall back to using all cores.
1762        let layer_specs: Vec<_> = if disable_topology {
1763            info!("Disabling topology awareness");
1764            layer_specs
1765                .iter()
1766                .cloned()
1767                .map(|mut s| {
1768                    s.kind.common_mut().nodes.clear();
1769                    s.kind.common_mut().llcs.clear();
1770                    s
1771                })
1772                .collect()
1773        } else {
1774            layer_specs.to_vec()
1775        };
1776
1777        // Open the BPF prog first for verification.
1778        let mut skel_builder = BpfSkelBuilder::default();
1779        skel_builder.obj_builder.debug(opts.verbose > 1);
1780        init_libbpf_logging(None);
1781        let mut skel = scx_ops_open!(skel_builder, open_object, layered)?;
1782
1783        // Enable autoload for conditionally loaded programs immediately after
1784        // creating the skel, which always happens before loading.
1785        if opts.enable_gpu_support {
1786            // By default, enable the nvidia_open kprobe if GPU support is
1787            // enabled. open has been observed to be relatively cheap to kprobe.
1788            if opts.gpu_kprobe_level >= 1 {
1789                compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
1790            }
1791            // enable the rest progressively based upon how often they are called
1792            // for observed workloads
1793            if opts.gpu_kprobe_level >= 2 {
1794                compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
1795            }
1796            if opts.gpu_kprobe_level >= 3 {
1797                compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
1798            }
1799        }
1800
1801        skel.maps.rodata_data.slice_ns = scx_enums.SCX_SLICE_DFL;
1802        skel.maps.rodata_data.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
1803
1804        // Initialize skel according to @opts.
1805        skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
1806
1807        if !opts.disable_queued_wakeup {
1808            match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
1809                0 => info!("Kernel does not support queued wakeup optimization"),
1810                v => skel.struct_ops.layered_mut().flags |= v,
1811            }
1812        }
1813
1814        skel.maps.rodata_data.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
1815        skel.maps.rodata_data.debug = opts.verbose as u32;
1816        skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
1817        skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
1818            opts.max_exec_us * 1000
1819        } else {
1820            opts.slice_us * 1000 * 20
1821        };
1822        skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u32;
1823        skel.maps.rodata_data.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
1824        skel.maps.rodata_data.smt_enabled = topo.smt_enabled;
1825        skel.maps.rodata_data.has_little_cores = topo.has_little_cores();
1826        skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
1827        skel.maps.rodata_data.antistall_sec = opts.antistall_sec;
1828        skel.maps.rodata_data.monitor_disable = opts.monitor_disable;
1829        skel.maps.rodata_data.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
1830        skel.maps.rodata_data.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
1831        skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
1832        skel.maps.rodata_data.enable_gpu_support = opts.enable_gpu_support;
1833
1834        for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
1835            skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
1836        }
1837        for cpu in topo.all_cpus.keys() {
1838            skel.maps.rodata_data.all_cpus[cpu / 8] |= 1 << (cpu % 8);
1839        }
1840
1841        skel.maps.rodata_data.nr_op_layers = layer_specs
1842            .iter()
1843            .filter(|spec| match &spec.kind {
1844                LayerKind::Open { .. } => spec.kind.common().preempt,
1845                _ => false,
1846            })
1847            .count() as u32;
1848        skel.maps.rodata_data.nr_on_layers = layer_specs
1849            .iter()
1850            .filter(|spec| match &spec.kind {
1851                LayerKind::Open { .. } => !spec.kind.common().preempt,
1852                _ => false,
1853            })
1854            .count() as u32;
1855        skel.maps.rodata_data.nr_gp_layers = layer_specs
1856            .iter()
1857            .filter(|spec| match &spec.kind {
1858                LayerKind::Grouped { .. } => spec.kind.common().preempt,
1859                _ => false,
1860            })
1861            .count() as u32;
1862        skel.maps.rodata_data.nr_gn_layers = layer_specs
1863            .iter()
1864            .filter(|spec| match &spec.kind {
1865                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1866                _ => false,
1867            })
1868            .count() as u32;
1869
1870        let mut min_open = u64::MAX;
1871        let mut min_preempt = u64::MAX;
1872
1873        for spec in layer_specs.iter() {
1874            if let LayerKind::Open { common, .. } = &spec.kind {
1875                min_open = min_open.min(common.disallow_open_after_us.unwrap());
1876                min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
1877            }
1878        }
1879
1880        skel.maps.rodata_data.min_open_layer_disallow_open_after_ns = match min_open {
1881            u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
1882            v => v,
1883        };
1884        skel.maps
1885            .rodata_data
1886            .min_open_layer_disallow_preempt_after_ns = match min_preempt {
1887            u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
1888            v => v,
1889        };
1890
1891        // Consider all layers empty at the beginning.
1892        for i in 0..layer_specs.len() {
1893            skel.maps.bss_data.empty_layer_ids[i] = i as u32;
1894        }
1895        skel.maps.bss_data.nr_empty_layer_ids = nr_layers as u32;
1896
1897        Self::init_layers(&mut skel, &layer_specs, &topo)?;
1898        Self::init_nodes(&mut skel, opts, &topo);
1899
1900        let mut skel = scx_ops_load!(skel, layered, uei)?;
1901
1902        let mut layers = vec![];
1903        let layer_growth_orders =
1904            LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
1905        for (idx, spec) in layer_specs.iter().enumerate() {
1906            let growth_order = layer_growth_orders
1907                .get(&idx)
1908                .with_context(|| "layer has no growth order".to_string())?;
1909            layers.push(Layer::new(spec, &topo, growth_order)?);
1910        }
1911
1912        let mut idle_qos_enabled = layers
1913            .iter()
1914            .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
1915        if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
1916            warn!("idle_resume_us not supported, ignoring");
1917            idle_qos_enabled = false;
1918        }
1919
1920        Self::init_cpus(&skel, &layer_specs, &topo)?;
1921        Self::init_llc_prox_map(&mut skel, &topo)?;
1922
1923        // Other stuff.
1924        let proc_reader = procfs::ProcReader::new();
1925
1926        // Handle setup if layered is running in a pid namespace.
1927        let input = ProgramInput {
1928            ..Default::default()
1929        };
1930        let prog = &mut skel.progs.initialize_pid_namespace;
1931
1932        let _ = prog.test_run(input);
1933
1934        // XXX If we try to refresh the cpumasks here before attaching, we
1935        // sometimes (non-deterministically) don't see the updated values in
1936        // BPF. It would be better to update the cpumasks here before we
1937        // attach, but the value will quickly converge anyway so it's not a
1938        // huge problem in the interim until we figure it out.
1939
1940        // Attach.
1941        let struct_ops = scx_ops_attach!(skel, layered)?;
1942        let stats_server = StatsServer::new(stats::server_data()).launch()?;
1943
1944        let sched = Self {
1945            struct_ops: Some(struct_ops),
1946            layer_specs,
1947
1948            sched_intv: Duration::from_secs_f64(opts.interval),
1949
1950            cpu_pool,
1951            layers,
1952            idle_qos_enabled,
1953
1954            sched_stats: Stats::new(&mut skel, &proc_reader)?,
1955
1956            nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
1957            processing_dur: Default::default(),
1958
1959            proc_reader,
1960            skel,
1961
1962            topo,
1963            netdevs,
1964            stats_server,
1965        };
1966
1967        info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
1968
1969        Ok(sched)
1970    }
1971
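    /// Copy a layer's current CPU assignment and per-LLC CPU counts into its
    /// BPF counterpart and flag the BPF side to refresh the cpumask.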
1972    fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
1973        trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
1974        for cpu in 0..layer.cpus.len() {
1975            if layer.cpus.test_cpu(cpu) {
1976                bpf_layer.cpus[cpu / 8] |= 1 << (cpu % 8);
1977            } else {
1978                bpf_layer.cpus[cpu / 8] &= !(1 << (cpu % 8));
1979            }
1980        }
1981
1982        bpf_layer.nr_cpus = layer.nr_cpus as u32;
1983        for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
1984            bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
1985        }
1986
1987        bpf_layer.refresh_cpus = 1;
1988    }
1989
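    /// Steer network device IRQs towards CPUs that are currently available in
    /// the pool, restricted to each device's own NUMA node. If none of the
    /// node's CPUs are available, spread across the whole node instead.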
1990    fn update_netdev_cpumasks(&mut self) -> Result<()> {
1991        let available_cpus = self.cpu_pool.available_cpus();
1992        if available_cpus.is_empty() {
1993            return Ok(());
1994        }
1995
1996        for (iface, netdev) in self.netdevs.iter_mut() {
1997            let node = self
1998                .topo
1999                .nodes
2000                .values()
2001                .find(|n| n.id == netdev.node())
2003                .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2004            let node_cpus = node.span.clone();
2005            for (irq, irqmask) in netdev.irqs.iter_mut() {
2006                irqmask.clear_all();
2007                for cpu in available_cpus.iter() {
2008                    if !node_cpus.test_cpu(cpu) {
2009                        continue;
2010                    }
2011                    let _ = irqmask.set_cpu(cpu);
2012                }
2013                // If no CPUs are available in the node then spread the load across the node
2014                if irqmask.weight() == 0 {
2015                    for cpu in node_cpus.iter() {
2016                        let _ = irqmask.set_cpu(cpu);
2017                    }
2018                }
2019                trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2020            }
2021            netdev.apply_cpumasks()?;
2022        }
2023
2024        Ok(())
2025    }
2026
2027    /// Calculate how many CPUs each layer would like to have if there were
2028    /// no competition. The CPU range is determined by applying the inverse
2029    /// of util_range and then capping by cpus_range. If the current
2030    /// allocation is within the acceptable range, no change is made.
2031    /// Returns (target, min) pair for each layer.
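    /// For example, a layer with util_range (0.8, 0.9) consuming 4.5 CPUs
    /// worth of utilization gets low = ceil(4.5 / 0.9) = 5 and
    /// high = floor(4.5 / 0.8) = 5, so its target is 5 CPUs before the
    /// cpus_range cap is applied.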
2032    fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2033        let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2034        let utils = &self.sched_stats.layer_utils;
2035
2036        let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2037        let mut targets: Vec<(usize, usize)> = vec![];
2038
2039        for (idx, layer) in self.layers.iter().enumerate() {
2040            targets.push(match &layer.kind {
2041                LayerKind::Confined {
2042                    util_range,
2043                    cpus_range,
2044                    cpus_range_frac,
2045                    ..
2046                }
2047                | LayerKind::Grouped {
2048                    util_range,
2049                    cpus_range,
2050                    cpus_range_frac,
2051                    ..
2052                } => {
2053                    // Guide layer sizing by utilization within each layer
2054                    // to avoid oversizing grouped layers. As an empty layer
2055                    // can only get CPU time through fallback (counted as
2056                    // owned) or open execution, add open cputime for empty
2057                    // layers.
2058                    let owned = utils[idx][LAYER_USAGE_OWNED];
2059                    let open = utils[idx][LAYER_USAGE_OPEN];
2060
2061                    let mut util = owned;
2062                    if layer.nr_cpus == 0 {
2063                        util += open;
2064                    }
2065
2066                    let util = if util < 0.01 { 0.0 } else { util };
2067                    let low = (util / util_range.1).ceil() as usize;
2068                    let high = ((util / util_range.0).floor() as usize).max(low);
2069                    let target = layer.cpus.weight().clamp(low, high);
2070                    let cpus_range =
2071                        resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2072
2073                    records.push((
2074                        (owned * 100.0) as u64,
2075                        (open * 100.0) as u64,
2076                        (util * 100.0) as u64,
2077                        low,
2078                        high,
2079                        target,
2080                    ));
2081
2082                    (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2083                }
2084                LayerKind::Open { .. } => (0, 0),
2085            });
2086        }
2087
2088        trace!("initial targets: {:?}", &targets);
2089        trace!("(owned, open, util, low, high, target): {:?}", &records);
2090        targets
2091    }
2092
2093    /// Given the (target, min) pair for each layer, determined assuming an
2094    /// infinite number of CPUs, distribute the actual CPUs according to
2095    /// their weights.
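    /// For example, with 16 CPUs and equal weights, targets (12, 2), (6, 2)
    /// and (2, 2) resolve to 8, 6 and 2 CPUs: the last layer is accepted at
    /// its min, the second fits under its share, and the first is capped at
    /// the remaining CPUs.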
2096    fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2097        let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2098        let weights: Vec<usize> = self
2099            .layers
2100            .iter()
2101            .map(|layer| layer.kind.common().weight as usize)
2102            .collect();
2103        let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2104            .iter()
2105            .zip(&weights)
2106            .enumerate()
2107            .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2108            .collect();
2109        let mut weight_sum: usize = weights.iter().sum();
2110        let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2111
2112        trace!("cands: {:?}", &cands);
2113
2114        // First, accept all layers that are <= min.
2115        cands.retain(|&i, &mut (target, min, weight)| {
2116            if target <= min {
2117                let target = target.min(nr_left);
2118                weighted[i] = target;
2119                weight_sum -= weight;
2120                nr_left -= target;
2121                false
2122            } else {
2123                true
2124            }
2125        });
2126
2127        trace!("cands after accepting mins: {:?}", &cands);
2128
2129        // Keep accepting ones under their allotted share.
2130        let calc_share = |nr_left, weight, weight_sum| {
2131            (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2132        };
2133
2134        while !cands.is_empty() {
2135            let mut progress = false;
2136
2137            cands.retain(|&i, &mut (target, _min, weight)| {
2138                let share = calc_share(nr_left, weight, weight_sum);
2139                if target <= share {
2140                    weighted[i] = target;
2141                    weight_sum -= weight;
2142                    nr_left -= target;
2143                    progress = true;
2144                    false
2145                } else {
2146                    true
2147                }
2148            });
2149
2150            if !progress {
2151                break;
2152            }
2153        }
2154
2155        trace!("cands after accepting under allotted: {:?}", &cands);
2156
2157        // The remaining candidates are in contention with each other,
2158        // distribute according to the shares.
2159        let nr_to_share = nr_left;
2160        for (i, (_target, _min, weight)) in cands.into_iter() {
2161            let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2162            weighted[i] = share;
2163            nr_left -= share;
2164        }
2165
2166        trace!("weighted: {:?}", &weighted);
2167
2168        weighted
2169    }
2170
2171    // Figure out a (full_llcs, extra_cores) tuple for the target CPU count
2172    // computed by weighted_target_nr_cpus(): the number of full LLCs occupied
2173    // by a layer and the number of extra cores covering the leftover CPUs.
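    // For example, with 8 CPUs per LLC (4 cores x 2 SMT threads), a target of
    // 21 CPUs maps to (2, 3): two full LLCs plus ceil(5 / 2) = 3 extra cores.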
2174    fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2175        // TODO(kkd): We assume each LLC has equal number of cores.
2176        let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2177        // TODO(kkd): We assume each core has fixed number of threads.
2178        let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2179        let cpus_per_llc = cores_per_llc * cpus_per_core;
2180
2181        let full = target / cpus_per_llc;
2182        let extra = target % cpus_per_llc;
2183
2184        (full, extra.div_ceil(cpus_per_core))
2185    }
2186
2187    // Recalculate the core order for layers using StickyDynamic growth
2188    // algorithm. Tuples from compute_target_llcs are used to decide how many
2189    // LLCs and cores should be assigned to each layer; the logic that allocates
2190    // and frees CPUs operates on that core order. This happens in three logical
2191    // steps: first free LLCs from layers that shrank since the last
2192    // recomputation, then distribute the freed LLCs to growing layers, and
2193    // finally spill over the remaining cores into free LLCs.
2194    fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
2195        // Collect freed LLCs from shrinking layers.
2196        debug!(
2197            " free: before pass: free_llcs={:?}",
2198            self.cpu_pool.free_llcs
2199        );
2200        for &(idx, target) in layer_targets.iter().rev() {
2201            let layer = &mut self.layers[idx];
2202            let old_tlc = layer.target_llc_cpus;
2203            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2204
2205            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2206                continue;
2207            }
2208
2209            let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
2210
2211            debug!(
2212                " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2213                layer.name,
2214                old_tlc,
2215                new_tlc,
2216                to_free,
2217                layer.assigned_llcs.len(),
2218                self.cpu_pool.free_llcs.len()
2219            );
2220
2221            while to_free > 0 && !layer.assigned_llcs.is_empty() {
2222                let llc = layer.assigned_llcs.pop().unwrap();
2223                self.cpu_pool.free_llcs.push((llc, 0));
2224                to_free -= 1;
2225
2226                debug!(" layer={} freed_llc={}", layer.name, llc);
2227            }
2228        }
2229        debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2230
2231        // Redistribute the freed LLCs to growing layers.
2232        for &(idx, target) in layer_targets.iter().rev() {
2233            let layer = &mut self.layers[idx];
2234            let old_tlc = layer.target_llc_cpus;
2235            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2236
2237            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2238                continue;
2239            }
2240
2241            let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
2242
2243            debug!(
2244                " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2245                layer.name,
2246                old_tlc,
2247                new_tlc,
2248                to_alloc,
2249                layer.assigned_llcs.len(),
2250                self.cpu_pool.free_llcs.len()
2251            );
2252
2253            while to_alloc > 0
2254                && !self.cpu_pool.free_llcs.is_empty()
2255                && to_alloc <= self.cpu_pool.free_llcs.len()
2256            {
2257                let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2258                layer.assigned_llcs.push(llc);
2259                to_alloc -= 1;
2260
2261                debug!(" layer={} alloc_llc={}", layer.name, llc);
2262            }
2263
2264            debug!(
2265                " alloc: layer={} assigned_llcs={:?}",
2266                layer.name, layer.assigned_llcs
2267            );
2268
2269            // Update for next iteration.
2270            layer.target_llc_cpus = new_tlc;
2271        }
2272
2273        // Spill over overflowing cores into free LLCs. Bigger layers get to
2274        // take a chunk before smaller layers.
2275        for &(idx, _) in layer_targets.iter() {
2276            let mut core_order = vec![];
2277            let layer = &mut self.layers[idx];
2278
2279            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2280                continue;
2281            }
2282
2283            let tlc = layer.target_llc_cpus;
2284            let mut extra = tlc.1;
2285            // TODO(kkd): Move this logic into cpu_pool? What's the best place?
2286            let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2287            let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2288            let cpus_per_llc = cores_per_llc * cpus_per_core;
2289
2290            // Consume from front since we pop from the back.
2291            for i in 0..self.cpu_pool.free_llcs.len() {
2292                let free_vec = &mut self.cpu_pool.free_llcs;
2293                // Available CPUs in LLC.
2294                let avail = cpus_per_llc - free_vec[i].1;
2295                // The amount we'll use.
2296                let mut used = extra.min(avail);
2297
2298                let shift = free_vec[i].1;
2299                free_vec[i].1 += used;
2300
2301                let llc_id = free_vec[i].0;
2302                let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2303
2304                for core in llc.cores.iter().skip(shift) {
2305                    core_order.push(core.1.id);
2306                    if used == 0 {
2307                        break;
2308                    }
2309                    used -= 1;
2310                }
2311
2312                extra -= used;
2313                if extra == 0 {
2314                    break;
2315                }
2316            }
2317
2318            core_order.reverse();
2319            layer.core_order = core_order;
2320        }
2321
2322        // Reset consumed entries in free LLCs.
2323        for i in 0..self.cpu_pool.free_llcs.len() {
2324            self.cpu_pool.free_llcs[i].1 = 0;
2325        }
2326
2327        for &(idx, _) in layer_targets.iter() {
2328            let layer = &mut self.layers[idx];
2329
2330            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2331                continue;
2332            }
2333
2334            for core in self.topo.all_cores.iter() {
2335                let llc_id = core.1.llc_id;
2336                if layer.assigned_llcs.contains(&llc_id) {
2337                    layer.core_order.push(core.1.id);
2338                }
2339            }
2340            // Update core_order for the layer, but reverse to keep the start stable.
2341            layer.core_order.reverse();
2342
2343            debug!(
2344                " alloc: layer={} core_order={:?}",
2345                layer.name, layer.core_order
2346            );
2347        }
2348    }
2349
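    /// Re-evaluate per-layer CPU allocations: shrink layers above their
    /// weighted targets first so CPUs become available, grow layers below
    /// their targets starting with the smallest, hand any remaining available
    /// CPUs to the open layers, and push the updated cpumasks to BPF.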
2350    fn refresh_cpumasks(&mut self) -> Result<()> {
2351        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
2352
2353        let mut updated = false;
2354        let targets = self.calc_target_nr_cpus();
2355        let targets = self.weighted_target_nr_cpus(&targets);
2356
2357        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
2358        ascending.sort_by(|a, b| a.1.cmp(&b.1));
2359
2360        self.recompute_layer_core_order(&ascending);
2361
2362        // If any layer is growing, guarantee that the largest layer that is
2363        // freeing CPUs frees at least one CPU.
2364        let mut force_free = self
2365            .layers
2366            .iter()
2367            .zip(targets.iter())
2368            .any(|(layer, &target)| layer.nr_cpus < target);
2369
2370        // Shrink all layers first so that CPUs are available for
2371        // redistribution. Do so in the descending target number of CPUs
2372        // order.
2373        for &(idx, target) in ascending.iter().rev() {
2374            let layer = &mut self.layers[idx];
2375            if layer_is_open(layer) {
2376                continue;
2377            }
2378
2379            let nr_cur = layer.cpus.weight();
2380            if nr_cur <= target {
2381                continue;
2382            }
2383            let mut nr_to_free = nr_cur - target;
2384
2385            // There's some dampening built into util metrics but slow down
2386            // freeing further to avoid unnecessary changes. This is solely
2387            // based on intuition. Drop or update according to real-world
2388            // behavior.
2389            let nr_to_break_at = nr_to_free / 2;
2390
2391            let mut freed = false;
2392
2393            while nr_to_free > 0 {
2394                let max_to_free = if force_free {
2395                    force_free = false;
2396                    layer.nr_cpus
2397                } else {
2398                    nr_to_free
2399                };
2400
2401                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
2402                if nr_freed == 0 {
2403                    break;
2404                }
2405
2406                nr_to_free = nr_to_free.saturating_sub(nr_freed);
2407                freed = true;
2408
2409                if nr_to_free <= nr_to_break_at {
2410                    break;
2411                }
2412            }
2413
2414            if freed {
2415                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
2416                updated = true;
2417            }
2418        }
2419
2420        // Grow layers. Do so in the ascending target number of CPUs order
2421        // so that we're always more generous to smaller layers. This avoids
2422        // starving small layers and shouldn't make a noticeable difference for
2423        // bigger layers as work conservation should still be achieved
2424        // through open execution.
2425        for &(idx, target) in &ascending {
2426            let layer = &mut self.layers[idx];
2427
2428            if layer_is_open(layer) {
2429                continue;
2430            }
2431
2432            let nr_cur = layer.cpus.weight();
2433            if nr_cur >= target {
2434                continue;
2435            }
2436
2437            let mut nr_to_alloc = target - nr_cur;
2438            let mut alloced = false;
2439
2440            while nr_to_alloc > 0 {
2441                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
2442                if nr_alloced == 0 {
2443                    break;
2444                }
2445                alloced = true;
2446                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
2447            }
2448
2449            if alloced {
2450                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
2451                updated = true;
2452            }
2453        }
2454
2455        // Give the rest to the open layers.
2456        if updated {
2457            for (idx, layer) in self.layers.iter_mut().enumerate() {
2458                if !layer_is_open(layer) {
2459                    continue;
2460                }
2461
2462                let bpf_layer = &mut self.skel.maps.bss_data.layers[idx];
2463                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
2464                let nr_available_cpus = available_cpus.weight();
2465
2466                // Open layers need the intersection of allowed cpus and
2467                // available cpus.
2468                layer.cpus = available_cpus;
2469                layer.nr_cpus = nr_available_cpus;
2470                Self::update_bpf_layer_cpumask(layer, bpf_layer);
2471            }
2472
2473            self.skel.maps.bss_data.fallback_cpu = self.cpu_pool.fallback_cpu as u32;
2474
2475            for (lidx, layer) in self.layers.iter().enumerate() {
2476                self.nr_layer_cpus_ranges[lidx] = (
2477                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
2478                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
2479                );
2480            }
2481
2482            // Trigger updates on the BPF side.
2483            let input = ProgramInput {
2484                ..Default::default()
2485            };
2486            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
2487            let _ = prog.test_run(input);
2488
2489            // Update empty_layers.
2490            let empty_layer_ids: Vec<u32> = self
2491                .layers
2492                .iter()
2493                .enumerate()
2494                .filter(|(_idx, layer)| layer.nr_cpus == 0)
2495                .map(|(idx, _layer)| idx as u32)
2496                .collect();
2497            for i in 0..self.layers.len() {
2498                self.skel.maps.bss_data.empty_layer_ids[i] =
2499                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2500            }
2501            self.skel.maps.bss_data.nr_empty_layer_ids = empty_layer_ids.len() as u32;
2502        }
2503
2504        let _ = self.update_netdev_cpumasks();
2505        Ok(())
2506    }
2507
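    /// Apply each layer's idle_resume_us setting to the CPUs it currently
    /// owns; CPUs not covered by any such layer are reset to 0 (no limit).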
2508    fn refresh_idle_qos(&mut self) -> Result<()> {
2509        if !self.idle_qos_enabled {
2510            return Ok(());
2511        }
2512
2513        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
2514        for layer in self.layers.iter() {
2515            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
2516            for cpu in layer.cpus.iter() {
2517                cpu_idle_qos[cpu] = idle_resume_us;
2518            }
2519        }
2520
2521        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
2522            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
2523        }
2524
2525        Ok(())
2526    }
2527
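    /// One scheduling interval: refresh stats, recompute layer cpumasks and
    /// idle QoS, and account the time spent processing.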
2528    fn step(&mut self) -> Result<()> {
2529        let started_at = Instant::now();
2530        self.sched_stats.refresh(
2531            &mut self.skel,
2532            &self.proc_reader,
2533            started_at,
2534            self.processing_dur,
2535        )?;
2536        self.refresh_cpumasks()?;
2537        self.refresh_idle_qos()?;
2538        self.processing_dur += Instant::now().duration_since(started_at);
2539        Ok(())
2540    }
2541
2542    fn generate_sys_stats(
2543        &mut self,
2544        stats: &Stats,
2545        cpus_ranges: &mut [(usize, usize)],
2546    ) -> Result<SysStats> {
2547        let bstats = &stats.bpf_stats;
2548        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
2549
2550        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
2551            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
2552            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
2553            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
2554        }
2555
2556        Ok(sys_stats)
2557    }
2558
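    /// Main loop: run step() every sched_intv and service stats requests in
    /// between, until shutdown is requested or the BPF scheduler exits.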
2559    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
2560        let (res_ch, req_ch) = self.stats_server.channels();
2561        let mut next_sched_at = Instant::now() + self.sched_intv;
2562        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
2563
2564        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
2565            let now = Instant::now();
2566
2567            if now >= next_sched_at {
2568                self.step()?;
2569                while next_sched_at < now {
2570                    next_sched_at += self.sched_intv;
2571                }
2572            }
2573
2574            match req_ch.recv_deadline(next_sched_at) {
2575                Ok(StatsReq::Hello(tid)) => {
2576                    cpus_ranges.insert(
2577                        tid,
2578                        self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
2579                    );
2580                    let stats = Stats::new(&mut self.skel, &self.proc_reader)?;
2581                    res_ch.send(StatsRes::Hello(stats))?;
2582                }
2583                Ok(StatsReq::Refresh(tid, mut stats)) => {
2584                    // Propagate self's layer cpu ranges into each stat's.
2585                    for i in 0..self.nr_layer_cpus_ranges.len() {
2586                        for (_, ranges) in cpus_ranges.iter_mut() {
2587                            ranges[i] = (
2588                                ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
2589                                ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
2590                            );
2591                        }
2592                        self.nr_layer_cpus_ranges[i] =
2593                            (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
2594                    }
2595
2596                    stats.refresh(&mut self.skel, &self.proc_reader, now, self.processing_dur)?;
2597                    let sys_stats =
2598                        self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
2599                    res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
2600                }
2601                Ok(StatsReq::Bye(tid)) => {
2602                    cpus_ranges.remove(&tid);
2603                    res_ch.send(StatsRes::Bye)?;
2604                }
2605                Err(RecvTimeoutError::Timeout) => {}
2606                Err(e) => Err(e)?,
2607            }
2608        }
2609
2610        self.struct_ops.take();
2611        uei_report!(&self.skel, uei)
2612    }
2613}
2614
2615impl Drop for Scheduler<'_> {
2616    fn drop(&mut self) {
2617        if let Some(struct_ops) = self.struct_ops.take() {
2618            drop(struct_ops);
2619        }
2620    }
2621}
2622
2623fn write_example_file(path: &str) -> Result<()> {
2624    let mut f = fs::OpenOptions::new()
2625        .create_new(true)
2626        .write(true)
2627        .open(path)?;
2628    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
2629}
2630
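/// Sanity-check the layer specs: spec count, match block limits, prefix
/// lengths, and cpus_range/util_range consistency. The last spec must be the
/// catch-all with a single empty match block.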
2631fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> {
2632    let nr_specs = specs.len();
2633    if nr_specs == 0 {
2634        bail!("No layer spec");
2635    }
2636    if nr_specs > MAX_LAYERS {
2637        bail!("Too many layer specs");
2638    }
2639
2640    for (idx, spec) in specs.iter().enumerate() {
2641        if idx < nr_specs - 1 {
2642            if spec.matches.is_empty() {
2643                bail!("Non-terminal spec {:?} has no match blocks", spec.name);
2644            }
2645        } else {
2646            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
2647                bail!("Terminal spec {:?} must have an empty match", spec.name);
2648            }
2649        }
2650
2651        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
2652            bail!(
2653                "Spec {:?} has too many ({}) OR match blocks",
2654                spec.name,
2655                spec.matches.len()
2656            );
2657        }
2658
2659        for (ands_idx, ands) in spec.matches.iter().enumerate() {
2660            if ands.len() > NR_LAYER_MATCH_KINDS {
2661                bail!(
2662                    "Spec {:?}'s {}th OR block has too many ({}) match conditions",
2663                    spec.name,
2664                    ands_idx,
2665                    ands.len()
2666                );
2667            }
2668            for one in ands.iter() {
2669                match one {
2670                    LayerMatch::CgroupPrefix(prefix) => {
2671                        if prefix.len() > MAX_PATH {
2672                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
2673                        }
2674                    }
2675                    LayerMatch::CommPrefix(prefix) => {
2676                        if prefix.len() > MAX_COMM {
2677                            bail!("Spec {:?} has too long a comm prefix", spec.name);
2678                        }
2679                    }
2680                    LayerMatch::PcommPrefix(prefix) => {
2681                        if prefix.len() > MAX_COMM {
2682                            bail!("Spec {:?} has too long a process name prefix", spec.name);
2683                        }
2684                    }
2685                    _ => {}
2686                }
2687            }
2688        }
2689
2690        match spec.kind {
2691            LayerKind::Confined {
2692                cpus_range,
2693                util_range,
2694                ..
2695            }
2696            | LayerKind::Grouped {
2697                cpus_range,
2698                util_range,
2699                ..
2700            } => {
2701                if let Some((cpus_min, cpus_max)) = cpus_range {
2702                    if cpus_min > cpus_max {
2703                        bail!(
2704                            "Spec {:?} has invalid cpus_range({}, {})",
2705                            spec.name,
2706                            cpus_min,
2707                            cpus_max
2708                        );
2709                    }
2710                }
2711                if util_range.0 >= util_range.1 {
2712                    bail!(
2713                        "Spec {:?} has invalid util_range ({}, {})",
2714                        spec.name,
2715                        util_range.0,
2716                        util_range.1
2717                    );
2718                }
2719            }
2720            _ => {}
2721        }
2722    }
2723
2724    Ok(())
2725}
2726
2727fn main() -> Result<()> {
2728    let opts = Opts::parse();
2729
2730    if opts.help_stats {
2731        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
2732        return Ok(());
2733    }
2734
2735    if opts.no_load_frac_limit {
2736        warn!("--no-load-frac-limit is deprecated and a no-op");
2737    }
2738    if opts.layer_preempt_weight_disable != 0.0 {
2739        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
2740    }
2741    if opts.layer_growth_weight_disable != 0.0 {
2742        warn!("--layer-growth-weight-disable is deprecated and a no-op");
2743    }
2744    if opts.local_llc_iteration {
2745        warn!("--local_llc_iteration is deprecated and a no-op");
2746    }
2747
2748    let llv = match opts.verbose {
2749        0 => simplelog::LevelFilter::Info,
2750        1 => simplelog::LevelFilter::Debug,
2751        _ => simplelog::LevelFilter::Trace,
2752    };
2753    let mut lcfg = simplelog::ConfigBuilder::new();
2754    lcfg.set_time_level(simplelog::LevelFilter::Error)
2755        .set_location_level(simplelog::LevelFilter::Off)
2756        .set_target_level(simplelog::LevelFilter::Off)
2757        .set_thread_level(simplelog::LevelFilter::Off);
2758    simplelog::TermLogger::init(
2759        llv,
2760        lcfg.build(),
2761        simplelog::TerminalMode::Stderr,
2762        simplelog::ColorChoice::Auto,
2763    )?;
2764
2765    debug!("opts={:?}", &opts);
2766
    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

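    // --stats and --monitor both spawn a thread that reports stats at the
    // given interval. --monitor does not load the scheduler, so in that case
    // just wait for the thread and exit.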
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(err) => {
                    warn!("stats monitor thread failed: {}", err)
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

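    // --example writes an example layer config to the given path and exits.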
    if let Some(path) = &opts.example {
        write_example_file(path)?;
        return Ok(());
    }

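    // Start from the built-in example config with --run-example, otherwise
    // from an empty spec list that is filled from the command line below.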
    let mut layer_config = match opts.run_example {
        true => EXAMPLE_CONFIG.clone(),
        false => LayerConfig { specs: vec![] },
    };

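    // Each spec argument may expand to one or more layer specs; append them
    // all in the order given.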
    for (idx, input) in opts.specs.iter().enumerate() {
        layer_config.specs.append(
            &mut LayerSpec::parse(input)
                .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?,
        );
    }

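    // Fill in per-layer defaults that the user left unset: slice length,
    // weight (clamped to the allowed range) and the disallow_* cutoffs below.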
    for spec in layer_config.specs.iter_mut() {
        let common = spec.kind.common_mut();

        if common.slice_us == 0 {
            common.slice_us = opts.slice_us;
        }

        if common.weight == 0 {
            common.weight = DEFAULT_LAYER_WEIGHT;
        }
        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);

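        // Preempt layers force both cutoffs to u64::MAX so they never kick in,
        // warning if the user supplied values; other layers fall back to the
        // DFL_DISALLOW_* defaults.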
        if common.preempt {
            if common.disallow_open_after_us.is_some() {
                warn!(
                    "Preempt layer {} has disallow_open_after_us set, ignoring it",
                    &spec.name
                );
            }
            if common.disallow_preempt_after_us.is_some() {
                warn!(
                    "Preempt layer {} has disallow_preempt_after_us set, ignoring it",
                    &spec.name
                );
            }
            common.disallow_open_after_us = Some(u64::MAX);
            common.disallow_preempt_after_us = Some(u64::MAX);
        } else {
            if common.disallow_open_after_us.is_none() {
                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
            }

            if common.disallow_preempt_after_us.is_none() {
                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
            }
        }

        if common.idle_smt.is_some() {
            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
        }
    }

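    // Dump the fully resolved config at debug level and validate it before
    // starting the scheduler.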
    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
    verify_layer_specs(&layer_config.specs)?;

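    // Keep re-initializing and re-running the scheduler for as long as a run
    // ends with a restart request; otherwise fall through and exit.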
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &layer_config.specs, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}