scx_layered/main.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::ffi::CString;
12use std::fs;
13use std::io::Write;
14use std::mem::MaybeUninit;
15use std::ops::Sub;
16use std::path::Path;
17use std::path::PathBuf;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::thread::ThreadId;
22use std::time::Duration;
23use std::time::Instant;
24
25use inotify::{Inotify, WatchMask};
26use libc;
27use std::os::unix::io::AsRawFd;
28
29use anyhow::anyhow;
30use anyhow::bail;
31use anyhow::Context;
32use anyhow::Result;
33pub use bpf_skel::*;
34use clap::Parser;
35use crossbeam::channel::Receiver;
36use crossbeam::select;
37use lazy_static::lazy_static;
38use libbpf_rs::libbpf_sys;
39use libbpf_rs::AsRawLibbpf;
40use libbpf_rs::MapCore as _;
41use libbpf_rs::OpenObject;
42use libbpf_rs::ProgramInput;
43use log::info;
44use log::trace;
45use log::warn;
46use log::{debug, error};
47use nix::sched::CpuSet;
48use nvml_wrapper::error::NvmlError;
49use nvml_wrapper::Nvml;
50use once_cell::sync::OnceCell;
51use regex::Regex;
52use scx_bpf_compat;
53use scx_layered::*;
54use scx_raw_pmu::PMUManager;
55use scx_stats::prelude::*;
56use scx_utils::build_id;
57use scx_utils::compat;
58use scx_utils::init_libbpf_logging;
59use scx_utils::libbpf_clap_opts::LibbpfOpts;
60use scx_utils::perf;
61use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
62use scx_utils::read_netdevs;
63use scx_utils::scx_enums;
64use scx_utils::scx_ops_attach;
65use scx_utils::scx_ops_load;
66use scx_utils::scx_ops_open;
67use scx_utils::uei_exited;
68use scx_utils::uei_report;
69use scx_utils::CoreType;
70use scx_utils::Cpumask;
71use scx_utils::Llc;
72use scx_utils::NetDev;
73use scx_utils::Topology;
74use scx_utils::TopologyArgs;
75use scx_utils::UserExitInfo;
76use scx_utils::NR_CPUS_POSSIBLE;
77use scx_utils::NR_CPU_IDS;
78use stats::LayerStats;
79use stats::StatsReq;
80use stats::StatsRes;
81use stats::SysStats;
82use std::collections::VecDeque;
83use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
84use walkdir::WalkDir;
85
86const SCHEDULER_NAME: &str = "scx_layered";
87const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
88const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
89const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
90const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
91const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
92const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
93const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
94const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
95const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
96const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
97
98const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
99const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
100const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
101const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
102const LAYER_USAGE_PROTECTED_PREEMPT: usize =
103    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
104const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
105
106const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
107const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
108const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
109
110const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
111
112static NVML: OnceCell<Nvml> = OnceCell::new();
113
114fn nvml() -> Result<&'static Nvml, NvmlError> {
115    NVML.get_or_try_init(Nvml::init)
116}
117
118lazy_static! {
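    // Per-second exponential decay factor chosen so that usage decays with the
    // configured half-life: USAGE_DECAY.powf(t) == 0.5f64.powf(t / USAGE_HALF_LIFE_F64),
    // i.e. after one half-life's worth of seconds the decayed contribution is halved.
    // See metric_decay() in Stats::refresh().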
119    static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
120    static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
121    static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
122    static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
123        specs: vec![
124            LayerSpec {
125                name: "batch".into(),
126                comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
127                cpuset: None,
128                template: None,
129                matches: vec![
130                    vec![LayerMatch::CgroupPrefix("system.slice/".into())],
131                    vec![LayerMatch::NiceAbove(0)],
132                ],
133                kind: LayerKind::Confined {
134                    util_range: (0.8, 0.9),
135                    cpus_range: Some((0, 16)),
136                    cpus_range_frac: None,
137                    protected: false,
138                    membw_gb: None,
139                    common: LayerCommon {
140                        min_exec_us: 1000,
141                        yield_ignore: 0.0,
142                        preempt: false,
143                        preempt_first: false,
144                        exclusive: false,
145                        allow_node_aligned: false,
146                        skip_remote_node: false,
147                        prev_over_idle_core: false,
148                        idle_smt: None,
149                        slice_us: 20000,
150                        fifo: false,
151                        weight: DEFAULT_LAYER_WEIGHT,
152                        disallow_open_after_us: None,
153                        disallow_preempt_after_us: None,
154                        xllc_mig_min_us: 1000.0,
155                        growth_algo: LayerGrowthAlgo::Sticky,
156                        idle_resume_us: None,
157                        perf: 1024,
158                        nodes: vec![],
159                        llcs: vec![],
160                        member_expire_ms: 0,
161                        placement: LayerPlacement::Standard,
162                    },
163                },
164            },
165            LayerSpec {
166                name: "immediate".into(),
167                comment: Some("tasks under workload.slice with nice value < 0".into()),
168                cpuset: None,
169                template: None,
170                matches: vec![vec![
171                    LayerMatch::CgroupPrefix("workload.slice/".into()),
172                    LayerMatch::NiceBelow(0),
173                ]],
174                kind: LayerKind::Open {
175                    common: LayerCommon {
176                        min_exec_us: 100,
177                        yield_ignore: 0.25,
178                        preempt: true,
179                        preempt_first: false,
180                        exclusive: true,
181                        allow_node_aligned: true,
182                        skip_remote_node: false,
183                        prev_over_idle_core: true,
184                        idle_smt: None,
185                        slice_us: 20000,
186                        fifo: false,
187                        weight: DEFAULT_LAYER_WEIGHT,
188                        disallow_open_after_us: None,
189                        disallow_preempt_after_us: None,
190                        xllc_mig_min_us: 0.0,
191                        growth_algo: LayerGrowthAlgo::Sticky,
192                        perf: 1024,
193                        idle_resume_us: None,
194                        nodes: vec![],
195                        llcs: vec![],
196                        member_expire_ms: 0,
197                        placement: LayerPlacement::Standard,
198                    },
199                },
200            },
201            LayerSpec {
202                name: "stress-ng".into(),
203                comment: Some("stress-ng test layer".into()),
204                cpuset: None,
205                template: None,
206                matches: vec![
207                    vec![LayerMatch::CommPrefix("stress-ng".into()),],
208                    vec![LayerMatch::PcommPrefix("stress-ng".into()),]
209                ],
210                kind: LayerKind::Confined {
211                    cpus_range: None,
212                    util_range: (0.2, 0.8),
213                    protected: false,
214                    cpus_range_frac: None,
215                    membw_gb: None,
216                    common: LayerCommon {
217                        min_exec_us: 800,
218                        yield_ignore: 0.0,
219                        preempt: true,
220                        preempt_first: false,
221                        exclusive: false,
222                        allow_node_aligned: false,
223                        skip_remote_node: false,
224                        prev_over_idle_core: false,
225                        idle_smt: None,
226                        slice_us: 800,
227                        fifo: false,
228                        weight: DEFAULT_LAYER_WEIGHT,
229                        disallow_open_after_us: None,
230                        disallow_preempt_after_us: None,
231                        xllc_mig_min_us: 0.0,
232                        growth_algo: LayerGrowthAlgo::Topo,
233                        perf: 1024,
234                        idle_resume_us: None,
235                        nodes: vec![],
236                        llcs: vec![],
237                        member_expire_ms: 0,
238                        placement: LayerPlacement::Standard,
239                    },
240                },
241            },
242            LayerSpec {
243                name: "normal".into(),
244                comment: Some("the rest".into()),
245                cpuset: None,
246                template: None,
247                matches: vec![vec![]],
248                kind: LayerKind::Grouped {
249                    cpus_range: None,
250                    util_range: (0.5, 0.6),
251                    util_includes_open_cputime: true,
252                    protected: false,
253                    cpus_range_frac: None,
254                    membw_gb: None,
255                    common: LayerCommon {
256                        min_exec_us: 200,
257                        yield_ignore: 0.0,
258                        preempt: false,
259                        preempt_first: false,
260                        exclusive: false,
261                        allow_node_aligned: false,
262                        skip_remote_node: false,
263                        prev_over_idle_core: false,
264                        idle_smt: None,
265                        slice_us: 20000,
266                        fifo: false,
267                        weight: DEFAULT_LAYER_WEIGHT,
268                        disallow_open_after_us: None,
269                        disallow_preempt_after_us: None,
270                        xllc_mig_min_us: 100.0,
271                        growth_algo: LayerGrowthAlgo::Linear,
272                        perf: 1024,
273                        idle_resume_us: None,
274                        nodes: vec![],
275                        llcs: vec![],
276                        member_expire_ms: 0,
277                        placement: LayerPlacement::Standard,
278                    },
279                },
280            },
281        ],
282    };
283}
284
285/// scx_layered: A highly configurable multi-layer sched_ext scheduler
286///
287/// scx_layered allows classifying tasks into multiple layers and applying
288/// different scheduling policies to them. The configuration is specified in
289/// json and composed of two parts - matches and policies.
290///
291/// Matches
292/// =======
293///
294/// Whenever a task is forked or its attributes are changed, the task goes
295/// through a series of matches to determine the layer it belongs to. A
296/// match set is composed of OR groups of AND blocks. An example:
297///
298///   "matches": [
299///     [
300///       {
301///         "CgroupPrefix": "system.slice/"
302///       }
303///     ],
304///     [
305///       {
306///         "CommPrefix": "fbagent"
307///       },
308///       {
309///         "NiceAbove": 0
310///       }
311///     ]
312///   ],
313///
314/// The outer array contains the OR groups and the inner AND blocks, so the
315/// above matches:
316///
317/// - Tasks which are in the cgroup sub-hierarchy under "system.slice".
318///
319/// - Or tasks whose comm starts with "fbagent" and have a nice value > 0.
320///
321/// Currently, the following matches are supported:
322///
323/// - CgroupPrefix: Matches the prefix of the cgroup that the task belongs
324///   to. As this is a string match, whether the pattern has the trailing
325///   '/' makes a difference. For example, "TOP/CHILD/" only matches tasks
326///   which are under that particular cgroup while "TOP/CHILD" also matches
327///   tasks under "TOP/CHILD0/" or "TOP/CHILD1/".
328///
329/// - CommPrefix: Matches the task's comm prefix.
330///
331/// - PcommPrefix: Matches the task's thread group leader's comm prefix.
332///
333/// - NiceAbove: Matches if the task's nice value is greater than the
334///   pattern.
335///
336/// - NiceBelow: Matches if the task's nice value is smaller than the
337///   pattern.
338///
339/// - NiceEquals: Matches if the task's nice value is exactly equal to
340///   the pattern.
341///
342/// - UIDEquals: Matches if the task's effective user id matches the value.
343///
344/// - GIDEquals: Matches if the task's effective group id matches the value.
345///
346/// - PIDEquals: Matches if the task's pid matches the value.
347///
348/// - PPIDEquals: Matches if the task's ppid matches the value.
349///
350/// - TGIDEquals: Matches if the task's tgid matches the value.
351///
352/// - NSPIDEquals: Matches if the task's namespace id and pid match the values.
353///
354/// - NSEquals: Matches if the task's namespace id matches the value.
355///
356/// - IsGroupLeader: Bool. When true, matches if the task is group leader
357///   (i.e. PID == TGID), aka the thread from which other threads are made.
358///   When false, matches if the task is *not* the group leader (i.e. the rest).
359///
360/// - CmdJoin: Matches when the task uses pthread_setname_np to send a join/leave
361///   command to the scheduler. See examples/cmdjoin.c for more details.
362///
363/// - UsedGpuTid: Bool. When true, matches tasks which have used GPUs,
364///   identified by tid.
365///
366/// - UsedGpuPid: Bool. When true, matches tasks which have used GPUs,
367///   identified by tgid/pid.
368///
369/// - [EXPERIMENTAL] AvgRuntime: (u64, u64). Match tasks whose average runtime
370///   is within the provided values [min, max).
371///
372/// - HintEquals: u64. Match tasks whose hint value equals this value.
373///   The value must be in the range [0, 1024].
374///
375/// - SystemCpuUtilBelow: f64. Match when the system CPU utilization fraction
376///   is below the specified threshold (a value in the range [0.0, 1.0]). This
377///   option can only be used in conjunction with HintEquals.
378///
379/// - DsqInsertBelow: f64. Match when the layer DSQ insertion fraction is below
380///   the specified threshold (a value in the range [0.0, 1.0]). This option can
381///   only be used in conjunction with HintEquals.
382///
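/// For illustration, an AND block that combines HintEquals with
/// SystemCpuUtilBelow (the hint value and threshold below are arbitrary)
/// matches hinted tasks only while system CPU utilization is below 50%:
///
///   "matches": [
///     [
///       {
///         "HintEquals": 1
///       },
///       {
///         "SystemCpuUtilBelow": 0.5
///       }
///     ]
///   ],
///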
383/// While there are complexity limitations as the matches are performed in
384/// BPF, it is straightforward to add more types of matches.
385///
386/// Templates
387/// ---------
388///
389/// Templates let us create a variable number of layers dynamically at initialization
390/// time out of a cgroup name suffix/prefix. Sometimes we know there are multiple
391/// applications running on a machine, each with its own cgroup, but we do not know the
392/// exact names of the applications or cgroups, e.g., in cloud computing contexts where
393/// workloads are placed on machines dynamically and run under cgroups whose name is
394/// autogenerated. In that case, we cannot hardcode the cgroup match rules when writing
395/// the configuration. We thus cannot easily prevent tasks from different cgroups from
396/// falling into the same layer and affecting each other's performance.
397///
398///
399/// Templates offer a solution to this problem by generating one layer for each such cgroup,
400/// provided these cgroups share a suffix, and that the suffix is unique to them. Templates
401/// have a cgroup suffix rule that we use to find the relevant cgroups in the system. For each
402/// such cgroup, we copy the layer config and add a matching rule that matches just this cgroup.
403///
404///
405/// Policies
406/// ========
407///
408/// The following is an example policy configuration for a layer.
409///
410///   "kind": {
411///     "Confined": {
412///       "cpus_range": [1, 8],
413///       "util_range": [0.8, 0.9]
414///     }
415///   }
416///
417/// It's of "Confined" kind, which tries to concentrate the layer's tasks
418/// into a limited number of CPUs. In the above case, the number of CPUs
419/// assigned to the layer is scaled between 1 and 8 so that the per-cpu
420/// utilization is kept between 80% and 90%. If the CPUs are loaded higher
421/// than 90%, more CPUs are allocated to the layer. If the utilization drops
422/// below 80%, the layer loses CPUs.
423///
424/// Currently, the following policy kinds are supported:
425///
426/// - Confined: Tasks are restricted to the allocated CPUs. The number of
427///   CPUs allocated is modulated to keep the per-CPU utilization in
428///   "util_range". The range can optionally be restricted with the
429///   "cpus_range" property.
430///
431/// - Grouped: Similar to Confined but tasks may spill outside if there are
432///   idle CPUs outside the allocated ones. The range can optionally be
433///   restricted with the "cpus_range" property.
434///
435/// - Open: Prefer the CPUs which are not occupied by Confined or Grouped
436///   layers. Tasks in this group will spill into occupied CPUs if there are
437///   no unoccupied idle CPUs.
438///
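/// For example, a Grouped layer can be specified the same way as the Confined
/// example above. This is only an illustrative sketch with arbitrary values;
/// unspecified fields are assumed to fall back to their defaults:
///
///   "kind": {
///     "Grouped": {
///       "cpus_range": [1, 16],
///       "util_range": [0.5, 0.6]
///     }
///   }
///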
439/// All layers take the following options:
440///
441/// - min_exec_us: Minimum execution time in microseconds. Whenever a task
442///   is scheduled in, this is the minimum CPU time that it's charged no
443///   matter how short the actual execution time may be.
444///
445/// - yield_ignore: Yield ignore ratio. If 0.0, yield(2) forfeits a whole
446///   execution slice. 0.25 yields three quarters of an execution slice and
447///   so on. If 1.0, yield is completely ignored.
448///
449/// - slice_us: Scheduling slice duration in microseconds.
450///
451/// - fifo: Use FIFO queues within the layer instead of the default vtime.
452///
453/// - preempt: If true, tasks in the layer will preempt tasks which belong
454///   to other non-preempting layers when no idle CPUs are available.
455///
456/// - preempt_first: If true, tasks in the layer will try to preempt tasks
457///   in their previous CPUs before trying to find idle CPUs.
458///
459/// - exclusive: If true, tasks in the layer will occupy the whole core. The
460///   other logical CPUs sharing the same core will be kept idle. This isn't
461///   a hard guarantee, so don't depend on it for security purposes.
462///
463/// - allow_node_aligned: Put node aligned tasks on layer DSQs instead of lo
464///   fallback. This is a hack to support node-affine tasks without making
465///   the whole scheduler node aware and should only be used with open
466///   layers on non-saturated machines to avoid possible stalls.
467///
468/// - prev_over_idle_core: On SMT-enabled systems, prefer the task's previous
469///   CPU when picking a CPU for tasks in this layer, even if that CPU's SMT
470///   sibling is busy processing a task.
471///
472/// - weight: Weight of the layer, in the range 1 to 10000 with a default of
473///   100. Layer weights are used during contention to prevent starvation
474///   across layers. Weights are combined with utilization to compute the
475///   infeasible-adjusted weight; higher weights receive a larger adjustment
476///   to their adjusted utilization.
477///
478/// - disallow_open_after_us: Duration to wait after machine reaches saturation
479///   before confining tasks in Open layers.
480///
481/// - cpus_range_frac: Array of 2 floats between 0 and 1.0. Lower and upper
482///   bound fractions of all CPUs to give to a layer. Mutually exclusive
483///   with cpus_range.
484///
485/// - disallow_preempt_after_us: Duration to wait after machine reaches
486///   saturation before tasks in the layer are no longer allowed to preempt.
487///
488/// - xllc_mig_min_us: Skip cross-LLC migrations if they are likely to run on
489///   their existing LLC sooner than this.
490///
491/// - idle_smt: *** DEPRECATED ***
492///
493/// - growth_algo: When a layer is allocated new CPUs, different algorithms can
494///   be used to determine which CPU should be allocated next. The default
495///   algorithm is a "sticky" algorithm that attempts to spread layers evenly
496///   across cores.
497///
498/// - perf: CPU performance target. 0 means no configuration. A value
499///   between 1 and 1024 indicates the performance level CPUs running tasks
500///   in this layer are configured to using scx_bpf_cpuperf_set().
501///
502/// - idle_resume_us: Sets the idle resume QoS value. CPU idle time governors are expected to
503///   regard the minimum of the global (effective) CPU latency limit and the effective resume
504///   latency constraint for the given CPU as the upper limit for the exit latency of the idle
505///   states. See the latest kernel docs for more details:
506///   https://www.kernel.org/doc/html/latest/admin-guide/pm/cpuidle.html
507///
508/// - nodes: If set, the layer will use the set of NUMA nodes for scheduling
509///   decisions. If unset, all available NUMA nodes will be used. If the
510///   llcs value is also set, the cpuset of NUMA nodes will be or'ed with the
511///   LLC config.
512///
513/// - llcs: If set, the layer will use the set of LLCs (last level caches)
514///   for scheduling decisions. If unset, all LLCs will be used. If the
515///   nodes value is also set, the cpuset of LLCs will be or'ed with the nodes
516///   config.
517///
518///
519/// Similar to matches, adding new policies and extending existing ones
520/// should be relatively straightforward.
521///
522/// Configuration example and running scx_layered
523/// =============================================
524///
525/// An scx_layered config is composed of layer configs. A layer config is
526/// composed of a name, a set of matches, and a policy block. Running the
527/// following will write an example configuration into example.json.
528///
529///   $ scx_layered -e example.json
530///
531/// Note that the last layer in the configuration must have an empty match set
532/// as a catch-all for tasks which haven't been matched into previous layers.
533///
534/// The configuration can be specified in multiple json files and
535/// command line arguments, which are concatenated in the specified
536/// order. Each must contain valid layer configurations.
537///
538/// By default, an argument to scx_layered is interpreted as a JSON string. If
539/// the argument is a path to a JSON file, it should be prefixed with file:
540/// or f: as follows:
541///
542///   $ scx_layered file:example.json
543///   ...
544///   $ scx_layered f:example.json
545///
546/// Monitoring Statistics
547/// =====================
548///
549/// Run with `--stats INTERVAL` to enable stats monitoring. There is
550/// also an scx_stat server listening on /var/run/scx/root/stat that can
551/// be monitored by running `scx_layered --monitor INTERVAL` separately.
552///
553///   ```bash
554///   $ scx_layered --monitor 1
555///   tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 proc=6ms
556///   busy= 34.2 util= 1733.6 load=  21744.1 fallback_cpu=  1
557///     batch    : util/frac=   11.8/  0.7 load/frac=     29.7:  0.1 tasks=  2597
558///                tot=   3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00
559///                cpus=  2 [  2,  2] 04000001 00000000
560///     immediate: util/frac= 1218.8/ 70.3 load/frac=  21399.9: 98.4 tasks=  1107
561///                tot=  68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00
562///                cpus= 50 [ 50, 50] fbfffffe 000fffff
563///     normal   : util/frac=  502.9/ 29.0 load/frac=    314.5:  1.4 tasks=  3512
564///                tot=  45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56
565///                cpus= 50 [ 50, 50] fbfffffe 000fffff
566///   ```
567///
568/// Global statistics: see [`SysStats`]
569///
570/// Per-layer statistics: see [`LayerStats`]
571///
572#[derive(Debug, Parser)]
573#[command(verbatim_doc_comment)]
574struct Opts {
575    /// Scheduling slice duration in microseconds.
576    #[clap(short = 's', long, default_value = "20000")]
577    slice_us: u64,
578
579    /// Maximum consecutive execution time in microseconds. A task may be
580    /// allowed to keep executing on a CPU for this long. Note that this is
581    /// the upper limit and a task may have to be moved off the CPU earlier. 0
582    /// indicates default - 20 * slice_us.
583    #[clap(short = 'M', long, default_value = "0")]
584    max_exec_us: u64,
585
586    /// Scheduling interval in seconds.
587    #[clap(short = 'i', long, default_value = "0.1")]
588    interval: f64,
589
590    /// ***DEPRECATED*** Disable load-fraction based max layer CPU limit. Not
591    /// recommended.
592    #[clap(short = 'n', long, default_value = "false")]
593    no_load_frac_limit: bool,
594
595    /// Exit debug dump buffer length. 0 indicates default.
596    #[clap(long, default_value = "0")]
597    exit_dump_len: u32,
598
599    /// Enable verbose output, including libbpf details. Specify multiple
600    /// times to increase verbosity.
601    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
602    verbose: u8,
603
604    /// Disable topology awareness. When set, the "nodes" and "llcs" settings on
605    /// a layer are ignored. Defaults to false on topologies with multiple NUMA nodes
606    /// or LLCs, and true otherwise.
607    #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
608    disable_topology: Option<bool>,
609
610    /// Enable cross NUMA preemption.
611    #[clap(long)]
612    xnuma_preemption: bool,
613
614    /// Disable monitor
615    #[clap(long)]
616    monitor_disable: bool,
617
618    /// Write example layer specifications into the file and exit.
619    #[clap(short = 'e', long)]
620    example: Option<String>,
621
622    /// ***DEPRECATED*** Disables preemption if the weighted load fraction
623    /// of a layer (load_frac_adj) exceeds the threshold. The default is
624    /// disabled (0.0).
625    #[clap(long, default_value = "0.0")]
626    layer_preempt_weight_disable: f64,
627
628    /// ***DEPRECATED*** Disables layer growth if the weighted load fraction
629    /// of a layer (load_frac_adj) exceeds the threshold. The default is
630    /// disabled (0.0).
631    #[clap(long, default_value = "0.0")]
632    layer_growth_weight_disable: f64,
633
634    /// Enable stats monitoring with the specified interval.
635    #[clap(long)]
636    stats: Option<f64>,
637
638    /// Run in stats monitoring mode with the specified interval. Scheduler
639    /// is not launched.
640    #[clap(long)]
641    monitor: Option<f64>,
642
643    /// Run with example layer specifications (useful for e.g. CI pipelines)
644    #[clap(long)]
645    run_example: bool,
646
647    /// ***DEPRECATED*** Enables iteration over local LLCs first for
648    /// dispatch.
649    #[clap(long, default_value = "false")]
650    local_llc_iteration: bool,
651
652    /// Low priority fallback DSQs are used to execute tasks with custom CPU
653    /// affinities. These DSQs are immediately executed iff a CPU is
654    /// otherwise idle. However, after the specified wait, they are
655    /// guaranteed up to --lo-fb-share fraction of each CPU.
656    #[clap(long, default_value = "10000")]
657    lo_fb_wait_us: u64,
658
659    /// The fraction of CPU time guaranteed to low priority fallback DSQs.
660    /// See --lo-fb-wait-us.
661    #[clap(long, default_value = ".05")]
662    lo_fb_share: f64,
663
664    /// Disable antistall
665    #[clap(long, default_value = "false")]
666    disable_antistall: bool,
667
668    /// Enable numa topology based gpu task affinitization.
669    #[clap(long, default_value = "false")]
670    enable_gpu_affinitize: bool,
671
672    /// Interval at which to reaffinitize gpu tasks to numa nodes.
673    /// Defaults to 900s
674    #[clap(long, default_value = "900")]
675    gpu_affinitize_secs: u64,
676
677    /// Enable match debug
678    /// This stores a mapping of task tid
679    /// to layer id such that bpftool map dump
680    /// can be used to debug layer matches.
681    #[clap(long, default_value = "false")]
682    enable_match_debug: bool,
683
684    /// Maximum task runnable_at delay (in seconds) before antistall turns on
685    #[clap(long, default_value = "3")]
686    antistall_sec: u64,
687
688    /// Enable gpu support
689    #[clap(long, default_value = "false")]
690    enable_gpu_support: bool,
691
692    /// GPU kprobe level.
693    /// The value set here determines how aggressive
694    /// the kprobes enabled on gpu driver functions are.
695    /// Higher values are more aggressive, incurring more system overhead while
696    /// identifying PIDs that use GPUs more accurately and in a more timely manner.
697    /// Lower values incur less system overhead, at the cost of less accurately
698    /// identifying GPU pids and taking longer to do so.
699    #[clap(long, default_value = "3")]
700    gpu_kprobe_level: u64,
701
702    /// Enable netdev IRQ balancing. This is experimental and should be used with caution.
703    #[clap(long, default_value = "false")]
704    netdev_irq_balance: bool,
705
706    /// Disable queued wakeup optimization.
707    #[clap(long, default_value = "false")]
708    disable_queued_wakeup: bool,
709
710    /// Per-cpu kthreads are preempting by default. Make it not so.
711    #[clap(long, default_value = "false")]
712    disable_percpu_kthread_preempt: bool,
713
714    /// Only highpri (nice < 0) per-cpu kthreads are preempting by default.
715    /// Make every per-cpu kthread preempting. Meaningful only if
716    /// --disable-percpu-kthread-preempt is not set.
717    #[clap(long, default_value = "false")]
718    percpu_kthread_preempt_all: bool,
719
720    /// Print scheduler version and exit.
721    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
722    version: bool,
723
724    /// Show descriptions for statistics.
725    #[clap(long)]
726    help_stats: bool,
727
728    /// Layer specification. See --help.
729    specs: Vec<String>,
730
731    /// Periodically force tasks in layers using the AvgRuntime match rule to reevaluate which layer they belong to. Default period of 2s. A value of 0
732    /// turns this off.
733    #[clap(long, default_value = "2000")]
734    layer_refresh_ms_avgruntime: u64,
735
736    /// Set the path for pinning the task hint map.
737    #[clap(long, default_value = "")]
738    task_hint_map: String,
739
740    /// Print the config (after template expansion) and exit.
741    #[clap(long, default_value = "false")]
742    print_and_exit: bool,
743
744    /// Enable affinitized tasks to use the hi fallback queue to get more CPU time.
745    #[clap(long, default_value = "")]
746    hi_fb_thread_name: String,
747
748    #[clap(flatten, next_help_heading = "Topology Options")]
749    topology: TopologyArgs,
750
751    #[clap(flatten, next_help_heading = "Libbpf Options")]
752    pub libbpf: LibbpfOpts,
753}
754
755// Cgroup event types for inter-thread communication
756#[derive(Debug, Clone)]
757enum CgroupEvent {
758    Created {
759        path: String,
760        cgroup_id: u64,    // inode number
761        match_bitmap: u64, // bitmap of matched regex rules
762    },
763    Removed {
764        path: String,
765        cgroup_id: u64, // inode number
766    },
767}
768
769fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
770    reader
771        .read_stat()
772        .context("Failed to read procfs")?
773        .total_cpu
774        .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
775}
776
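// Compute the busy fraction of CPU time between two /proc/stat snapshots:
// busy is the sum of the user, nice, system, irq, softirq and stolen deltas,
// total is busy plus the idle and iowait deltas, and the result is
// busy / total clamped to [0.0, 1.0] (1.0 if no time has passed).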
777fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
778    match (curr, prev) {
779        (
780            fb_procfs::CpuStat {
781                user_usec: Some(curr_user),
782                nice_usec: Some(curr_nice),
783                system_usec: Some(curr_system),
784                idle_usec: Some(curr_idle),
785                iowait_usec: Some(curr_iowait),
786                irq_usec: Some(curr_irq),
787                softirq_usec: Some(curr_softirq),
788                stolen_usec: Some(curr_stolen),
789                ..
790            },
791            fb_procfs::CpuStat {
792                user_usec: Some(prev_user),
793                nice_usec: Some(prev_nice),
794                system_usec: Some(prev_system),
795                idle_usec: Some(prev_idle),
796                iowait_usec: Some(prev_iowait),
797                irq_usec: Some(prev_irq),
798                softirq_usec: Some(prev_softirq),
799                stolen_usec: Some(prev_stolen),
800                ..
801            },
802        ) => {
803            let idle_usec = curr_idle.saturating_sub(*prev_idle);
804            let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
805            let user_usec = curr_user.saturating_sub(*prev_user);
806            let system_usec = curr_system.saturating_sub(*prev_system);
807            let nice_usec = curr_nice.saturating_sub(*prev_nice);
808            let irq_usec = curr_irq.saturating_sub(*prev_irq);
809            let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
810            let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
811
812            let busy_usec =
813                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
814            let total_usec = idle_usec + busy_usec + iowait_usec;
815            if total_usec > 0 {
816                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
817            } else {
818                Ok(1.0)
819            }
820        }
821        _ => bail!("Missing stats in cpustat"),
822    }
823}
824
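// Copy `src` into the fixed-size i8 buffer `dst` as a NUL-terminated C string.
// The caller must ensure `dst` can hold `src` plus the terminating NUL;
// otherwise the slice copy below panics.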
825fn copy_into_cstr(dst: &mut [i8], src: &str) {
826    let cstr = CString::new(src).unwrap();
827    let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
828    dst[0..bytes.len()].copy_from_slice(bytes);
829}
830
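// Build a node bitmask with one bit set per node id, e.g. nodes [0, 2] yield
// 0b101. llcmask_from_llcs() below does the same for LLC ids.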
831fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
832    let mut mask = 0;
833    for node in nodes {
834        mask |= 1 << node;
835    }
836    mask
837}
838
839fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
840    let mut mask = 0;
841    for (_, cache) in llcs {
842        mask |= 1 << cache.id;
843    }
844    mask
845}
846
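// Read every possible CPU's cpu_ctx out of the per-CPU `cpu_ctxs` BPF map.
// The percpu lookup returns one byte blob per CPU, each of which is
// reinterpreted as a bpf_intf::cpu_ctx and copied into the returned vector.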
847fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
848    let mut cpu_ctxs = vec![];
849    let cpu_ctxs_vec = skel
850        .maps
851        .cpu_ctxs
852        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
853        .context("Failed to lookup cpu_ctx")?
854        .unwrap();
855    for cpu in 0..*NR_CPUS_POSSIBLE {
856        cpu_ctxs.push(*unsafe {
857            &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
858        });
859    }
860    Ok(cpu_ctxs)
861}
862
863#[derive(Clone, Debug)]
864struct BpfStats {
865    gstats: Vec<u64>,
866    lstats: Vec<Vec<u64>>,
867    lstats_sums: Vec<u64>,
868    llc_lstats: Vec<Vec<Vec<u64>>>, // [layer][llc][stat]
869}
870
871impl BpfStats {
872    fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
873        let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
874        let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
875        let mut gstats = vec![0u64; NR_GSTATS];
876        let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
877        let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
878
879        for cpu in 0..*NR_CPUS_POSSIBLE {
880            for stat in 0..NR_GSTATS {
881                gstats[stat] += cpu_ctxs[cpu].gstats[stat];
882            }
883            for layer in 0..nr_layers {
884                for stat in 0..NR_LSTATS {
885                    lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
886                }
887            }
888        }
889
890        let mut lstats_sums = vec![0u64; NR_LSTATS];
891        for layer in 0..nr_layers {
892            for stat in 0..NR_LSTATS {
893                lstats_sums[stat] += lstats[layer][stat];
894            }
895        }
896
897        for llc_id in 0..nr_llcs {
898            // XXX - This would be a lot easier if llc_ctx were in
899            // the bss. Unfortunately, kernel < v6.12 crashes and
900            // kernel >= v6.12 fails verification after such
901            // conversion due to seemingly verifier bugs. Convert to
902            // bss maps later.
903            let key = llc_id as u32;
904            let llc_id_slice =
905                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
906            let v = skel
907                .maps
908                .llc_data
909                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
910                .unwrap()
911                .unwrap();
912            let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
913
914            for layer_id in 0..nr_layers {
915                for stat_id in 0..NR_LLC_LSTATS {
916                    llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
917                }
918            }
919        }
920
921        Self {
922            gstats,
923            lstats,
924            lstats_sums,
925            llc_lstats,
926        }
927    }
928}
929
930impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
931    type Output = BpfStats;
932
933    fn sub(self, rhs: &'b BpfStats) -> BpfStats {
934        let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
935        BpfStats {
936            gstats: vec_sub(&self.gstats, &rhs.gstats),
937            lstats: self
938                .lstats
939                .iter()
940                .zip(rhs.lstats.iter())
941                .map(|(l, r)| vec_sub(l, r))
942                .collect(),
943            lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
944            llc_lstats: self
945                .llc_lstats
946                .iter()
947                .zip(rhs.llc_lstats.iter())
948                .map(|(l_layer, r_layer)| {
949                    l_layer
950                        .iter()
951                        .zip(r_layer.iter())
952                        .map(|(l_llc, r_llc)| {
953                            let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
954                            // Lat is not subtractable, take L side.
955                            r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
956                            vec_sub(&l_llc, &r_llc)
957                        })
958                        .collect()
959                })
960                .collect(),
961        }
962    }
963}
964
965#[derive(Clone, Debug)]
966struct Stats {
967    at: Instant,
968    elapsed: Duration,
969    nr_layers: usize,
970    nr_layer_tasks: Vec<usize>,
971    nr_nodes: usize,
972
973    total_util: f64, // Running AVG of sum of layer_utils
974    layer_utils: Vec<Vec<f64>>,
975    prev_layer_usages: Vec<Vec<u64>>,
976
977    layer_membws: Vec<Vec<f64>>, // Estimated memory bandwidth consumption
978    prev_layer_membw_agg: Vec<Vec<u64>>, // Estimated aggregate membw consumption
979
980    cpu_busy: f64, // Read from /proc, maybe higher than total_util
981    prev_total_cpu: fb_procfs::CpuStat,
982    prev_pmu_resctrl_membw: (u64, u64), // (PMU-reported membw, resctrl-reported membw)
983
984    system_cpu_util_ewma: f64,       // 10s EWMA of system CPU utilization
985    layer_dsq_insert_ewma: Vec<f64>, // 10s EWMA of per-layer DSQ insertion ratio
986
987    bpf_stats: BpfStats,
988    prev_bpf_stats: BpfStats,
989
990    processing_dur: Duration,
991    prev_processing_dur: Duration,
992
993    layer_slice_us: Vec<u64>,
994
995    gpu_tasks_affinitized: u64,
996    gpu_task_affinitization_ms: u64,
997}
998
999impl Stats {
1000    fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1001        let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1002
1003        for cpu in 0..*NR_CPUS_POSSIBLE {
1004            for layer in 0..nr_layers {
1005                for usage in 0..NR_LAYER_USAGES {
1006                    layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
1007                }
1008            }
1009        }
1010
1011        layer_usages
1012    }
1013
1014    // Same as above, but for aggregate memory bandwidth consumption.
1015    fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1016        let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1017
1018        for cpu in 0..*NR_CPUS_POSSIBLE {
1019            for layer in 0..nr_layers {
1020                for usage in 0..NR_LAYER_USAGES {
1021                    layer_membw_agg[layer][usage] += cpu_ctxs[cpu].layer_membw_agg[layer][usage];
1022                }
1023            }
1024        }
1025
1026        layer_membw_agg
1027    }
1028
1029    /// Use the membw reported by resctrl to normalize the values reported by hw counters.
1030    /// We have the following problem:
1031    /// 1) We want per-task memory bandwidth reporting. We cannot do this with resctrl, much
1032    /// less transparently, since we would require a different RMID for each task.
1033    /// 2) We want to directly use perf counters for tracking per-task memory bandwidth, but
1034    /// we can't: non-resctrl counters do not measure the right thing (e.g., they only measure
1035    /// proxies like load operations).
1036    /// 3) Resctrl counters are not directly accessible, so we cannot read them from the BPF side.
1037    ///
1038    /// Approximate per-task memory bandwidth using perf counters to measure _relative_ memory
1039    /// bandwidth usage.
1040    fn resctrl_read_total_membw() -> Result<u64> {
1041        let mut total_membw = 0u64;
1042        for entry in WalkDir::new("/sys/fs/resctrl/mon_data")
1043            .min_depth(1)
1044            .into_iter()
1045            .filter_map(Result::ok)
1046            .filter(|x| x.path().is_dir())
1047        {
1048            let mut path = entry.path().to_path_buf();
1049            path.push("mbm_total_bytes");
1050            total_membw += fs::read_to_string(path)?.trim().parse::<u64>()?;
1051        }
1052
1053        Ok(total_membw)
1054    }
1055
1056    fn new(
1057        skel: &mut BpfSkel,
1058        proc_reader: &fb_procfs::ProcReader,
1059        gpu_task_affinitizer: &GpuTaskAffinitizer,
1060    ) -> Result<Self> {
1061        let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
1062        let cpu_ctxs = read_cpu_ctxs(skel)?;
1063        let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1064        let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
1065        let pmu_membw = Self::read_layer_membw_agg(&cpu_ctxs, nr_layers);
1066
1067        Ok(Self {
1068            at: Instant::now(),
1069            elapsed: Default::default(),
1070            nr_layers,
1071            nr_layer_tasks: vec![0; nr_layers],
1072            nr_nodes,
1073
1074            total_util: 0.0,
1075            layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1076            layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1077            prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1078            // This is not normalized because we don't have enough history to do so.
1079            // It should not matter too much, since the value is dropped on the first
1080            // iteration.
1081            prev_layer_membw_agg: pmu_membw,
1082            prev_pmu_resctrl_membw: (0, 0),
1083
1084            cpu_busy: 0.0,
1085            prev_total_cpu: read_total_cpu(proc_reader)?,
1086            system_cpu_util_ewma: 0.0,
1087            layer_dsq_insert_ewma: vec![0.0; nr_layers],
1088
1089            bpf_stats: bpf_stats.clone(),
1090            prev_bpf_stats: bpf_stats,
1091
1092            processing_dur: Default::default(),
1093            prev_processing_dur: Default::default(),
1094
1095            layer_slice_us: vec![0; nr_layers],
1096            gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1097            gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1098        })
1099    }
1100
1101    fn refresh(
1102        &mut self,
1103        skel: &mut BpfSkel,
1104        proc_reader: &fb_procfs::ProcReader,
1105        now: Instant,
1106        cur_processing_dur: Duration,
1107        gpu_task_affinitizer: &GpuTaskAffinitizer,
1108    ) -> Result<()> {
1109        let elapsed = now.duration_since(self.at);
1110        let elapsed_f64 = elapsed.as_secs_f64();
1111        let cpu_ctxs = read_cpu_ctxs(skel)?;
1112
1113        let nr_layer_tasks: Vec<usize> = skel
1114            .maps
1115            .bss_data
1116            .as_ref()
1117            .unwrap()
1118            .layers
1119            .iter()
1120            .take(self.nr_layers)
1121            .map(|layer| layer.nr_tasks as usize)
1122            .collect();
1123        let layer_slice_us: Vec<u64> = skel
1124            .maps
1125            .bss_data
1126            .as_ref()
1127            .unwrap()
1128            .layers
1129            .iter()
1130            .take(self.nr_layers)
1131            .map(|layer| layer.slice_ns / 1000_u64)
1132            .collect();
1133
1134        let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1135        let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1136
1137        // Memory BW normalization. It requires finding the delta according to perf, the delta
1138        // according to resctrl, and finding the factor between them. This also helps in
1139        // determining whether this delta is stable.
1140        //
1141        // We scale only the raw PMC delta - the part of the counter incremented between two
1142        // periods. This is because the scaling factor is only valid for that time period.
1143        // Non-scaled samples are not comparable, while scaled ones are.
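        // For example (illustrative numbers only), if resctrl reports 8e9
        // more bytes than at the previous refresh while the summed PMU proxy
        // counters advanced by 2e9 units, the factor is 4.0 and each layer's
        // PMU-derived delta below is multiplied by it.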
1144        let (pmu_prev, resctrl_prev) = self.prev_pmu_resctrl_membw;
1145        let pmu_cur: u64 = cur_layer_membw_agg
1146            .iter()
1147            .map(|membw_agg| membw_agg[LAYER_USAGE_OPEN] + membw_agg[LAYER_USAGE_OWNED])
1148            .sum();
1149        let resctrl_cur = Self::resctrl_read_total_membw()?;
1150        let factor = (resctrl_cur - resctrl_prev) as f64 / (pmu_cur - pmu_prev) as f64;
1151
1152        // Computes the runtime deltas and converts them from ns to s.
1153        let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1154            cur_agg
1155                .iter()
1156                .zip(prev_agg.iter())
1157                .map(|(cur, prev)| {
1158                    cur.iter()
1159                        .zip(prev.iter())
1160                        .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1161                        .collect()
1162                })
1163                .collect()
1164        };
1165
1166        // Computes the total memory traffic done since last computation and normalizes to GBs.
1167        // We derive the rate of consumption elsewhere.
1168        let compute_mem_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1169            cur_agg
1170                .iter()
1171                .zip(prev_agg.iter())
1172                .map(|(cur, prev)| {
1173                    cur.iter()
1174                        .zip(prev.iter())
1175                        .map(|(c, p)| (*c as i64 - *p as i64) as f64 / (1024 as f64).powf(3.0))
1176                        .collect()
1177                })
1178                .collect()
1179        };
1180
1181        let cur_layer_utils: Vec<Vec<f64>> =
1182            compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1183
1184        // Scale the raw value delta by the resctrl/pmc computed factor.
1185        let cur_layer_membw: Vec<Vec<f64>> =
1186            compute_mem_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1187
1188        let cur_layer_membw: Vec<Vec<f64>> = cur_layer_membw
1189            .iter()
1190            .map(|x| x.iter().map(|x| *x * factor).collect())
1191            .collect();
1192
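        // Blend each new sample into the running value with a time-dependent
        // decay: p * decay_rate^elapsed + c * (1 - decay_rate^elapsed). With
        // USAGE_DECAY this yields the configured usage half-life; with a
        // decay_rate of 0.0 (used for membw below) it reduces to the latest
        // sample.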
1193        let metric_decay =
1194            |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>, decay_rate: f64| {
1195                cur_metric
1196                    .iter()
1197                    .zip(prev_metric.iter())
1198                    .map(|(cur, prev)| {
1199                        cur.iter()
1200                            .zip(prev.iter())
1201                            .map(|(c, p)| {
1202                                let decay = decay_rate.powf(elapsed_f64);
1203                                p * decay + c * (1.0 - decay)
1204                            })
1205                            .collect()
1206                    })
1207                    .collect()
1208            };
1209
1210        let layer_utils: Vec<Vec<f64>> =
1211            metric_decay(cur_layer_utils, &self.layer_utils, *USAGE_DECAY);
1212        let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws, 0.0);
1213
1214        let cur_total_cpu = read_total_cpu(proc_reader)?;
1215        let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1216
1217        // Calculate system CPU utilization EWMA (10 second window)
1218        const SYS_CPU_UTIL_EWMA_SECS: f64 = 10.0;
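        // alpha is the fraction of the averaging window covered by this
        // refresh: elapsed / 10s for short refresh intervals, saturating at
        // 1.0 once a single refresh spans the whole window, at which point
        // the EWMA simply tracks the instantaneous value.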
1219        let elapsed_f64 = elapsed.as_secs_f64();
1220        let alpha = elapsed_f64 / SYS_CPU_UTIL_EWMA_SECS.max(elapsed_f64);
1221        let system_cpu_util_ewma = alpha * cpu_busy + (1.0 - alpha) * self.system_cpu_util_ewma;
1222
1223        let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1224        let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1225
1226        // Calculate per-layer DSQ insertion EWMA (10 second window)
1227        const DSQ_INSERT_EWMA_SECS: f64 = 10.0;
1228        let dsq_alpha = elapsed_f64 / DSQ_INSERT_EWMA_SECS.max(elapsed_f64);
1229        let layer_dsq_insert_ewma: Vec<f64> = (0..self.nr_layers)
1230            .map(|layer_id| {
1231                let sel_local = bpf_stats.lstats[layer_id]
1232                    [bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize]
1233                    as f64;
1234                let enq_local = bpf_stats.lstats[layer_id]
1235                    [bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize]
1236                    as f64;
1237                let enq_dsq = bpf_stats.lstats[layer_id]
1238                    [bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize]
1239                    as f64;
1240                let total_dispatches = sel_local + enq_local + enq_dsq;
1241
1242                let cur_ratio = if total_dispatches > 0.0 {
1243                    enq_dsq / total_dispatches
1244                } else {
1245                    0.0
1246                };
1247
1248                dsq_alpha * cur_ratio + (1.0 - dsq_alpha) * self.layer_dsq_insert_ewma[layer_id]
1249            })
1250            .collect();
1251
1252        let processing_dur = cur_processing_dur
1253            .checked_sub(self.prev_processing_dur)
1254            .unwrap();
1255
1256        *self = Self {
1257            at: now,
1258            elapsed,
1259            nr_layers: self.nr_layers,
1260            nr_layer_tasks,
1261            nr_nodes: self.nr_nodes,
1262
1263            total_util: layer_utils
1264                .iter()
1265                .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1266                .sum(),
1267            layer_utils,
1268            layer_membws,
1269            prev_layer_usages: cur_layer_usages,
1270            prev_layer_membw_agg: cur_layer_membw_agg,
1271            // Was updated during normalization.
1272            prev_pmu_resctrl_membw: (pmu_cur, resctrl_cur),
1273
1274            cpu_busy,
1275            prev_total_cpu: cur_total_cpu,
1276            system_cpu_util_ewma,
1277            layer_dsq_insert_ewma,
1278
1279            bpf_stats,
1280            prev_bpf_stats: cur_bpf_stats,
1281
1282            processing_dur,
1283            prev_processing_dur: cur_processing_dur,
1284
1285            layer_slice_us,
1286            gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1287            gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1288        };
1289        Ok(())
1290    }
1291}
1292
1293#[derive(Debug)]
1294struct Layer {
1295    name: String,
1296    kind: LayerKind,
1297    growth_algo: LayerGrowthAlgo,
1298    core_order: Vec<usize>,
1299
1300    target_llc_cpus: (usize, usize),
1301    assigned_llcs: Vec<usize>,
1302
1303    nr_cpus: usize,
1304    nr_llc_cpus: Vec<usize>,
1305    cpus: Cpumask,
1306    allowed_cpus: Cpumask,
1307}
1308
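// Look up a symbol address by scanning /proc/kallsyms. This is a substring
// match that returns the first matching line, so `sym_name` should be
// specific enough to avoid false hits.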
1309fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1310    fs::read_to_string("/proc/kallsyms")?
1311        .lines()
1312        .find(|line| line.contains(sym_name))
1313        .and_then(|line| line.split_whitespace().next())
1314        .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1315        .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1316}
1317
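// Resolve the effective (min, max) CPU count for a layer from either
// cpus_range or cpus_range_frac (the two are mutually exclusive). For example,
// with max_cpus == 16 and cpus_range_frac == (0.25, 0.5) this yields (4, 8);
// with neither set, the full (0, max_cpus) range is used.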
1318fn resolve_cpus_pct_range(
1319    cpus_range: &Option<(usize, usize)>,
1320    cpus_range_frac: &Option<(f64, f64)>,
1321    max_cpus: usize,
1322) -> Result<(usize, usize)> {
1323    match (cpus_range, cpus_range_frac) {
1324        (Some(_x), Some(_y)) => {
1325            bail!("cpus_range cannot be used with cpus_range_frac.");
1326        }
1327        (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1328        (None, Some((cpus_frac_min, cpus_frac_max))) => {
1329            if *cpus_frac_min < 0_f64
1330                || *cpus_frac_min > 1_f64
1331                || *cpus_frac_max < 0_f64
1332                || *cpus_frac_max > 1_f64
1333            {
1334                bail!("cpus_range_frac values must be between 0.0 and 1.0");
1335            }
1336            let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1337            let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1338            Ok((
1339                std::cmp::max(cpus_min_count, 1),
1340                std::cmp::min(cpus_max_count, max_cpus),
1341            ))
1342        }
1343        (None, None) => Ok((0, max_cpus)),
1344    }
1345}
1346
1347impl Layer {
1348    fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1349        let name = &spec.name;
1350        let kind = spec.kind.clone();
1351        let mut allowed_cpus = Cpumask::new();
1352        match &kind {
1353            LayerKind::Confined {
1354                cpus_range,
1355                cpus_range_frac,
1356                common: LayerCommon { nodes, llcs, .. },
1357                ..
1358            } => {
1359                let cpus_range =
1360                    resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1361                if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1362                    bail!("invalid cpus_range {:?}", cpus_range);
1363                }
1364                if nodes.is_empty() && llcs.is_empty() {
1365                    allowed_cpus.set_all();
1366                } else {
1367                    // build up the cpus bitset
1368                    for (node_id, node) in &topo.nodes {
1369                        // first do the matching for nodes
1370                        if nodes.contains(node_id) {
1371                            for &id in node.all_cpus.keys() {
1372                                allowed_cpus.set_cpu(id)?;
1373                            }
1374                        }
1375                        // next match on any LLCs
1376                        for (llc_id, llc) in &node.llcs {
1377                            if llcs.contains(llc_id) {
1378                                for &id in llc.all_cpus.keys() {
1379                                    allowed_cpus.set_cpu(id)?;
1380                                }
1381                            }
1382                        }
1383                    }
1384                }
1385            }
1386            LayerKind::Grouped {
1387                common: LayerCommon { nodes, llcs, .. },
1388                ..
1389            }
1390            | LayerKind::Open {
1391                common: LayerCommon { nodes, llcs, .. },
1392                ..
1393            } => {
1394                if nodes.is_empty() && llcs.is_empty() {
1395                    allowed_cpus.set_all();
1396                } else {
1397                    // build up the cpus bitset
1398                    for (node_id, node) in &topo.nodes {
1399                        // first do the matching for nodes
1400                        if nodes.contains(node_id) {
1401                            for &id in node.all_cpus.keys() {
1402                                allowed_cpus.set_cpu(id)?;
1403                            }
1404                        }
1405                        // next match on any LLCs
1406                        for (llc_id, llc) in &node.llcs {
1407                            if llcs.contains(llc_id) {
1408                                for &id in llc.all_cpus.keys() {
1409                                    allowed_cpus.set_cpu(id)?;
1410                                }
1411                            }
1412                        }
1413                    }
1414                }
1415            }
1416        }
1417
1418        // Util can be above 1.0 for grouped layers if
1419        // util_includes_open_cputime is set.
1420        if let Some(util_range) = kind.util_range() {
1421            if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1422                bail!("invalid util_range {:?}", util_range);
1423            }
1424        }
1425
1426        let layer_growth_algo = kind.common().growth_algo.clone();
1427
1428        debug!(
1429            "layer: {} algo: {:?} core order: {:?}",
1430            name, &layer_growth_algo, core_order
1431        );
1432
1433        Ok(Self {
1434            name: name.into(),
1435            kind,
1436            growth_algo: layer_growth_algo,
1437            core_order: core_order.clone(),
1438
1439            target_llc_cpus: (0, 0),
1440            assigned_llcs: vec![],
1441
1442            nr_cpus: 0,
1443            nr_llc_cpus: vec![0; topo.all_llcs.len()],
1444            cpus: Cpumask::new(),
1445            allowed_cpus,
1446        })
1447    }
1448
1449    fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1450        let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1451            Some(ret) => ret.clone(),
1452            None => return Ok(0),
1453        };
1454
1455        let nr_to_free = cpus_to_free.weight();
1456
1457        Ok(if nr_to_free <= max_to_free {
1458            trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1459            self.cpus &= &cpus_to_free.not();
1460            self.nr_cpus -= nr_to_free;
1461            for cpu in cpus_to_free.iter() {
1462                self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1463            }
1464            cpu_pool.free(&cpus_to_free)?;
1465            nr_to_free
1466        } else {
1467            0
1468        })
1469    }
1470
1471    fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1472        let new_cpus = match cpu_pool
1473            .alloc_cpus(&self.allowed_cpus, &self.core_order)
1474            .clone()
1475        {
1476            Some(ret) => ret.clone(),
1477            None => {
1478                trace!("layer-{} can't grow, no CPUs", &self.name);
1479                return Ok(0);
1480            }
1481        };
1482
1483        let nr_new_cpus = new_cpus.weight();
1484
1485        trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1486        self.cpus |= &new_cpus;
1487        self.nr_cpus += nr_new_cpus;
1488        for cpu in new_cpus.iter() {
1489            self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1490        }
1491        Ok(nr_new_cpus)
1492    }
1493}
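
// Minimal illustration of the Cpumask bookkeeping performed by
// free_some_cpus() and alloc_some_cpus() above: CPUs are added with `|= &mask`
// and removed with `&= &mask.not()` while weight() tracks the running count.
// This is an added documentation sketch that relies only on the Cpumask calls
// already used in this file.
#[cfg(test)]
mod layer_cpumask_bookkeeping_tests {
    use super::*;

    #[test]
    fn add_then_remove_cpus() {
        // Use at most the first four CPU IDs so the sketch also works on
        // small machines.
        let nr = (*NR_CPU_IDS).min(4);
        if nr < 2 {
            return;
        }

        // "Allocate" CPUs 0..nr, mirroring alloc_some_cpus().
        let mut owned = Cpumask::new();
        let mut new_cpus = Cpumask::new();
        for cpu in 0..nr {
            new_cpus.set_cpu(cpu).unwrap();
        }
        owned |= &new_cpus;
        assert_eq!(owned.weight(), nr);

        // "Free" the highest CPU again, mirroring free_some_cpus().
        let mut to_free = Cpumask::new();
        to_free.set_cpu(nr - 1).unwrap();
        owned &= &to_free.not();

        assert_eq!(owned.weight(), nr - 1);
        assert!(!owned.test_cpu(nr - 1));
    }
}
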
1494#[derive(Debug, Clone)]
1495struct NodeInfo {
1496    node_mask: nix::sched::CpuSet,
1497    _node_id: usize,
1498}
1499
1500#[derive(Debug)]
1501struct GpuTaskAffinitizer {
1502    // Tracks the information necessary to periodically NUMA-affinitize GPU
1503    // tasks when needed.
1504    gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1505    gpu_pids_to_devs: HashMap<Pid, u32>,
1506    last_process_time: Option<Instant>,
1507    sys: System,
1508    pid_map: HashMap<Pid, Vec<Pid>>,
1509    poll_interval: Duration,
1510    enable: bool,
1511    tasks_affinitized: u64,
1512    last_task_affinitization_ms: u64,
1513}
1514
1515impl GpuTaskAffinitizer {
1516    pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1517        GpuTaskAffinitizer {
1518            gpu_devs_to_node_info: HashMap::new(),
1519            gpu_pids_to_devs: HashMap::new(),
1520            last_process_time: None,
1521            sys: System::default(),
1522            pid_map: HashMap::new(),
1523            poll_interval: Duration::from_secs(poll_interval),
1524            enable,
1525            tasks_affinitized: 0,
1526            last_task_affinitization_ms: 0,
1527        }
1528    }
1529
1530    fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1531        for (chunk, &mask) in affinity.iter().enumerate() {
1532            let mut inner_offset: u64 = 1;
1533            for _ in 0..64 {
1534                if (mask & inner_offset) != 0 {
1535                    return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1536                }
1537                inner_offset <<= 1;
1538            }
1539        }
1540        anyhow::bail!("unable to get CPU from NVML bitmask");
1541    }
1542
1543    fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1544        let mut cpuset = CpuSet::new();
1545        for (cpu_id, _cpu) in &node.all_cpus {
1546            cpuset.set(*cpu_id)?;
1547        }
1548        Ok(cpuset)
1549    }
1550
1551    fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1552        let nvml = nvml()?;
1553        let device_count = nvml.device_count()?;
1554
1555        for idx in 0..device_count {
1556            let dev = nvml.device_by_index(idx)?;
1557            // 16 u64 words of affinity mask, enough for machines with up to 1024 CPUs.
1558            let cpu = dev.cpu_affinity(16)?;
1559            let ideal_cpu = self.find_one_cpu(cpu)?;
1560            if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1561                self.gpu_devs_to_node_info.insert(
1562                    idx,
1563                    NodeInfo {
1564                        node_mask: self.node_to_cpuset(
1565                            topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1566                        )?,
1567                        _node_id: cpu.node_id,
1568                    },
1569                );
1570            }
1571        }
1572        Ok(())
1573    }
1574
1575    fn update_gpu_pids(&mut self) -> Result<()> {
1576        let nvml = nvml()?;
1577        for i in 0..nvml.device_count()? {
1578            let device = nvml.device_by_index(i)?;
1579            for proc in device
1580                .running_compute_processes()?
1581                .into_iter()
1582                .chain(device.running_graphics_processes()?.into_iter())
1583            {
1584                self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1585            }
1586        }
1587        Ok(())
1588    }
1589
1590    fn update_process_info(&mut self) -> Result<()> {
1591        self.sys.refresh_processes_specifics(
1592            ProcessesToUpdate::All,
1593            true,
1594            ProcessRefreshKind::nothing(),
1595        );
1596        self.pid_map.clear();
1597        for (pid, proc_) in self.sys.processes() {
1598            if let Some(ppid) = proc_.parent() {
1599                self.pid_map.entry(ppid).or_default().push(*pid);
1600            }
1601        }
1602        Ok(())
1603    }
1604
1605    fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1606        let mut work = VecDeque::from([root_pid]);
1607        let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1608
1609        while let Some(pid) = work.pop_front() {
1610            if pids_and_tids.insert(pid) {
1611                if let Some(kids) = self.pid_map.get(&pid) {
1612                    work.extend(kids);
1613                }
1614                if let Some(proc_) = self.sys.process(pid) {
1615                    if let Some(tasks) = proc_.tasks() {
1616                        pids_and_tids.extend(tasks.iter().copied());
1617                    }
1618                }
1619            }
1620        }
1621        pids_and_tids
1622    }
1623
1624    fn affinitize_gpu_pids(&mut self) -> Result<()> {
1625        if !self.enable {
1626            return Ok(());
1627        }
1628        for (pid, dev) in &self.gpu_pids_to_devs {
1629            let node_info = self
1630                .gpu_devs_to_node_info
1631                .get(dev)
1632                .expect("Unable to get gpu pid node mask");
1633            for child in self.get_child_pids_and_tids(*pid) {
1634                match nix::sched::sched_setaffinity(
1635                    nix::unistd::Pid::from_raw(child.as_u32() as i32),
1636                    &node_info.node_mask,
1637                ) {
1638                    Ok(_) => {
1639                        // Increment the global counter for successful affinitization
1640                        self.tasks_affinitized += 1;
1641                    }
1642                    Err(_) => {
1643                        debug!(
1644                            "Error affinitizing gpu pid {} to node {:#?}",
1645                            child.as_u32(),
1646                            node_info
1647                        );
1648                    }
1649                };
1650            }
1651        }
1652        Ok(())
1653    }
1654
1655    pub fn maybe_affinitize(&mut self) {
1656        if !self.enable {
1657            return;
1658        }
1659        let now = Instant::now();
1660
1661        if let Some(last_process_time) = self.last_process_time {
1662            if (now - last_process_time) < self.poll_interval {
1663                return;
1664            }
1665        }
1666
1667        match self.update_gpu_pids() {
1668            Ok(_) => {}
1669            Err(e) => {
1670                error!("Error updating GPU PIDs: {}", e);
1671            }
1672        };
1673        match self.update_process_info() {
1674            Ok(_) => {}
1675            Err(e) => {
1676                error!("Error updating process info to affinitize GPU PIDs: {}", e);
1677            }
1678        };
1679        match self.affinitize_gpu_pids() {
1680            Ok(_) => {}
1681            Err(e) => {
1682                error!("Error affinitizing GPU PIDs: {}", e);
1683            }
1684        };
1685        self.last_process_time = Some(now);
1686        self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1689    }
1690
1691    pub fn init(&mut self, topo: Arc<Topology>) {
1692        if !self.enable || self.last_process_time.is_some() {
1693            return;
1694        }
1695
1696        match self.init_dev_node_map(topo) {
1697            Ok(_) => {}
1698            Err(e) => {
1699                error!("Error initializing gpu node dev map: {}", e);
1700            }
1701        };
1702        self.sys = System::new_all();
1704    }
1705}
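
// Minimal sketch of how GpuTaskAffinitizer::find_one_cpu() interprets the NVML
// CPU-affinity bitmask (an array of u64 words, LSB-first within each word).
// Added for documentation only; it touches no NVML or GPU state, and the
// module and test names are arbitrary.
#[cfg(test)]
mod gpu_affinity_bitmask_tests {
    use super::*;

    #[test]
    fn picks_lowest_set_bit_across_words() {
        // Affinitization disabled; only the pure bitmask helper is used.
        let aff = GpuTaskAffinitizer::new(0, false);

        // Bit 0 of the first word is CPU 0.
        assert_eq!(aff.find_one_cpu(vec![1]).unwrap(), 0);

        // Bit 5 of the second word corresponds to CPU 64 + 5 = 69.
        assert_eq!(aff.find_one_cpu(vec![0, 1 << 5]).unwrap(), 69);

        // An all-zero mask is an error.
        assert!(aff.find_one_cpu(vec![0, 0]).is_err());
    }
}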
1706
1707struct Scheduler<'a> {
1708    skel: BpfSkel<'a>,
1709    struct_ops: Option<libbpf_rs::Link>,
1710    layer_specs: Vec<LayerSpec>,
1711
1712    sched_intv: Duration,
1713    layer_refresh_intv: Duration,
1714
1715    cpu_pool: CpuPool,
1716    layers: Vec<Layer>,
1717    idle_qos_enabled: bool,
1718
1719    proc_reader: fb_procfs::ProcReader,
1720    sched_stats: Stats,
1721
1722    cgroup_regexes: Option<HashMap<u32, Regex>>,
1723
1724    nr_layer_cpus_ranges: Vec<(usize, usize)>,
1725    processing_dur: Duration,
1726
1727    topo: Arc<Topology>,
1728    netdevs: BTreeMap<String, NetDev>,
1729    stats_server: StatsServer<StatsReq, StatsRes>,
1730    gpu_task_handler: GpuTaskAffinitizer,
1731}
1732
1733impl<'a> Scheduler<'a> {
1734    fn init_layers(
1735        skel: &mut OpenBpfSkel,
1736        specs: &[LayerSpec],
1737        topo: &Topology,
1738    ) -> Result<HashMap<u32, Regex>> {
1739        skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1740        let mut perf_set = false;
1741
1742        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1743        let mut layer_weights: Vec<usize> = vec![];
1744        let mut cgroup_regex_id = 0;
1745        let mut cgroup_regexes = HashMap::new();
1746
1747        for (spec_i, spec) in specs.iter().enumerate() {
1748            let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1749
1750            for (or_i, or) in spec.matches.iter().enumerate() {
1751                for (and_i, and) in or.iter().enumerate() {
1752                    let mt = &mut layer.matches[or_i].matches[and_i];
1753
1754                    // Rules are allowlist-based by default
1755                    mt.exclude.write(false);
1756
1757                    match and {
1758                        LayerMatch::CgroupPrefix(prefix) => {
1759                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1760                            copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1761                        }
1762                        LayerMatch::CgroupSuffix(suffix) => {
1763                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1764                            copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1765                        }
1766                        LayerMatch::CgroupRegex(regex_str) => {
1767                            if cgroup_regex_id >= bpf_intf::consts_MAX_CGROUP_REGEXES {
1768                                bail!(
1769                                    "Too many cgroup regex rules. Maximum allowed: {}",
1770                                    bpf_intf::consts_MAX_CGROUP_REGEXES
1771                                );
1772                            }
1773
1774                            // CgroupRegex matching handled in userspace via cgroup watcher
1775                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_REGEX as i32;
1776                            mt.cgroup_regex_id = cgroup_regex_id;
1777
1778                            let regex = Regex::new(regex_str).with_context(|| {
1779                                format!("Invalid regex '{}' in layer '{}'", regex_str, spec.name)
1780                            })?;
1781                            cgroup_regexes.insert(cgroup_regex_id, regex);
1782                            cgroup_regex_id += 1;
1783                        }
1784                        LayerMatch::CgroupContains(substr) => {
1785                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1786                            copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1787                        }
1788                        LayerMatch::CommPrefix(prefix) => {
1789                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1790                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1791                        }
1792                        LayerMatch::CommPrefixExclude(prefix) => {
1793                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1794                            mt.exclude.write(true);
1795                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1796                        }
1797                        LayerMatch::PcommPrefix(prefix) => {
1798                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1799                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1800                        }
1801                        LayerMatch::PcommPrefixExclude(prefix) => {
1802                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1803                            mt.exclude.write(true);
1804                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1805                        }
1806                        LayerMatch::NiceAbove(nice) => {
1807                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1808                            mt.nice = *nice;
1809                        }
1810                        LayerMatch::NiceBelow(nice) => {
1811                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1812                            mt.nice = *nice;
1813                        }
1814                        LayerMatch::NiceEquals(nice) => {
1815                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1816                            mt.nice = *nice;
1817                        }
1818                        LayerMatch::UIDEquals(user_id) => {
1819                            mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1820                            mt.user_id = *user_id;
1821                        }
1822                        LayerMatch::GIDEquals(group_id) => {
1823                            mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1824                            mt.group_id = *group_id;
1825                        }
1826                        LayerMatch::PIDEquals(pid) => {
1827                            mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1828                            mt.pid = *pid;
1829                        }
1830                        LayerMatch::PPIDEquals(ppid) => {
1831                            mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1832                            mt.ppid = *ppid;
1833                        }
1834                        LayerMatch::TGIDEquals(tgid) => {
1835                            mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1836                            mt.tgid = *tgid;
1837                        }
1838                        LayerMatch::NSPIDEquals(nsid, pid) => {
1839                            mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1840                            mt.nsid = *nsid;
1841                            mt.pid = *pid;
1842                        }
1843                        LayerMatch::NSEquals(nsid) => {
1844                            mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1845                            mt.nsid = *nsid as u64;
1846                        }
1847                        LayerMatch::CmdJoin(joincmd) => {
1848                            mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1849                            copy_into_cstr(&mut mt.comm_prefix, joincmd);
1850                        }
1851                        LayerMatch::IsGroupLeader(polarity) => {
1852                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1853                            mt.is_group_leader.write(*polarity);
1854                        }
1855                        LayerMatch::IsKthread(polarity) => {
1856                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1857                            mt.is_kthread.write(*polarity);
1858                        }
1859                        LayerMatch::UsedGpuTid(polarity) => {
1860                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1861                            mt.used_gpu_tid.write(*polarity);
1862                        }
1863                        LayerMatch::UsedGpuPid(polarity) => {
1864                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1865                            mt.used_gpu_pid.write(*polarity);
1866                        }
1867                        LayerMatch::AvgRuntime(min, max) => {
1868                            mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1869                            mt.min_avg_runtime_us = *min;
1870                            mt.max_avg_runtime_us = *max;
1871                        }
1872                        LayerMatch::HintEquals(hint) => {
1873                            mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1874                            mt.hint = *hint;
1875                        }
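                        // The utilization-style thresholds below are scaled by
                        // 10000 into fixed-point integers before being handed
                        // to BPF; the hint map population in init() uses the
                        // same scaling.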
1876                        LayerMatch::SystemCpuUtilBelow(threshold) => {
1877                            mt.kind = bpf_intf::layer_match_kind_MATCH_SYSTEM_CPU_UTIL_BELOW as i32;
1878                            mt.system_cpu_util_below = (*threshold * 10000.0) as u64;
1879                        }
1880                        LayerMatch::DsqInsertBelow(threshold) => {
1881                            mt.kind = bpf_intf::layer_match_kind_MATCH_DSQ_INSERT_BELOW as i32;
1882                            mt.dsq_insert_below = (*threshold * 10000.0) as u64;
1883                        }
1884                    }
1885                }
1886                layer.matches[or_i].nr_match_ands = or.len() as i32;
1887            }
1888
1889            layer.nr_match_ors = spec.matches.len() as u32;
1890            layer.kind = spec.kind.as_bpf_enum();
1891
1892            {
1893                let LayerCommon {
1894                    min_exec_us,
1895                    yield_ignore,
1896                    perf,
1897                    preempt,
1898                    preempt_first,
1899                    exclusive,
1900                    allow_node_aligned,
1901                    skip_remote_node,
1902                    prev_over_idle_core,
1903                    growth_algo,
1904                    nodes,
1905                    slice_us,
1906                    fifo,
1907                    weight,
1908                    disallow_open_after_us,
1909                    disallow_preempt_after_us,
1910                    xllc_mig_min_us,
1911                    placement,
1912                    member_expire_ms,
1913                    ..
1914                } = spec.kind.common();
1915
1916                layer.slice_ns = *slice_us * 1000;
1917                layer.fifo.write(*fifo);
1918                layer.min_exec_ns = min_exec_us * 1000;
1919                layer.yield_step_ns = if *yield_ignore > 0.999 {
1920                    0
1921                } else if *yield_ignore < 0.001 {
1922                    layer.slice_ns
1923                } else {
1924                    (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1925                };
1926                let mut layer_name: String = spec.name.clone();
1927                layer_name.truncate(MAX_LAYER_NAME);
1928                copy_into_cstr(&mut layer.name, layer_name.as_str());
1929                layer.preempt.write(*preempt);
1930                layer.preempt_first.write(*preempt_first);
1931                layer.excl.write(*exclusive);
1932                layer.allow_node_aligned.write(*allow_node_aligned);
1933                layer.skip_remote_node.write(*skip_remote_node);
1934                layer.prev_over_idle_core.write(*prev_over_idle_core);
1935                layer.growth_algo = growth_algo.as_bpf_enum();
1936                layer.weight = *weight;
1937                layer.member_expire_ms = *member_expire_ms;
1938                layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1939                    v if v == u64::MAX => v,
1940                    v => v * 1000,
1941                };
1942                layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1943                    v if v == u64::MAX => v,
1944                    v => v * 1000,
1945                };
1946                layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1947                layer_weights.push(layer.weight.try_into().unwrap());
1948                layer.perf = u32::try_from(*perf)?;
1949                layer.node_mask = nodemask_from_nodes(nodes) as u64;
1950                for (topo_node_id, topo_node) in &topo.nodes {
1951                    if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1952                        continue;
1953                    }
1954                    layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1955                }
1956
1957                let task_place = |place: u32| crate::types::layer_task_place(place);
1958                layer.task_place = match placement {
1959                    LayerPlacement::Standard => {
1960                        task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1961                    }
1962                    LayerPlacement::Sticky => {
1963                        task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1964                    }
1965                    LayerPlacement::Floating => {
1966                        task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1967                    }
1968                };
1969            }
1970
1971            layer.is_protected.write(match spec.kind {
1972                LayerKind::Open { .. } => false,
1973                LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1974                    protected
1975                }
1976            });
1977
1978            match &spec.cpuset {
1979                Some(mask) => {
1980                    Self::update_cpumask(&mask, &mut layer.cpuset);
1981                }
1982                None => {
1983                    for i in 0..layer.cpuset.len() {
1984                        layer.cpuset[i] = u8::MAX;
1985                    }
1986                }
1987            };
1988
1989            perf_set |= layer.perf > 0;
1990        }
1991
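        // The BPF side walks layers in layer_iteration_order; sort it by
        // ascending layer weight.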
1992        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1993        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1994            skel.maps
1995                .rodata_data
1996                .as_mut()
1997                .unwrap()
1998                .layer_iteration_order[idx] = *layer_idx as u32;
1999        }
2000
2001        if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
2002            warn!("cpufreq support not available, ignoring perf configurations");
2003        }
2004
2005        Ok(cgroup_regexes)
2006    }
2007
2008    fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
2009        skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
2010        skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
2011
2012        for (&node_id, node) in &topo.nodes {
2013            debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
2014            skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
2015            let raw_numa_slice = node.span.as_raw_slice();
2016            let node_cpumask_slice =
2017                &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
2018            let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
2019            left.clone_from_slice(raw_numa_slice);
2020            debug!(
2021                "node {} mask: {:?}",
2022                node_id,
2023                skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
2024            );
2025
2026            for llc in node.llcs.values() {
2027                debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
2028                skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
2029            }
2030        }
2031
2032        for cpu in topo.all_cpus.values() {
2033            skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
2034        }
2035    }
2036
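    // The per-CPU proximity map built below orders every other CPU by how
    // close it is to the owning CPU: same core first, then same LLC, then same
    // NUMA node, then the rest of the system, with each tier shuffled per CPU
    // via radiate()/radiate_cpu(). As a purely illustrative example (not a real
    // topology): on a 4-CPU machine with cores {0,1} and {2,3} sharing one LLC,
    // CPU 0 might end up with prox_map.cpus = [0, 1, 2, 3], core_end = 2 and
    // llc_end = node_end = sys_end = 4.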
2037    fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
2038        let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
2039            vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
2040            vec
2041        };
2042        let radiate_cpu =
2043            |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
2044                vec.sort_by_key(|&id| {
2045                    (
2046                        (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
2047                        (center_cpu as i32 - id as i32).abs(),
2048                    )
2049                });
2050                vec
2051            };
2052
2053        for (&cpu_id, cpu) in &topo.all_cpus {
2054            // Collect the spans.
2055            let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
2056            let llc_span = &topo.all_llcs[&cpu.llc_id].span;
2057            let node_span = &topo.nodes[&cpu.node_id].span;
2058            let sys_span = &topo.span;
2059
2060            // Make the spans exclusive.
2061            let sys_span = sys_span.and(&node_span.not());
2062            let node_span = node_span.and(&llc_span.not());
2063            let llc_span = llc_span.and(&core_span.not());
2064            core_span.clear_cpu(cpu_id).unwrap();
2065
2066            // Convert them into arrays.
2067            let mut sys_order: Vec<usize> = sys_span.iter().collect();
2068            let mut node_order: Vec<usize> = node_span.iter().collect();
2069            let mut llc_order: Vec<usize> = llc_span.iter().collect();
2070            let mut core_order: Vec<usize> = core_span.iter().collect();
2071
2072            // Shuffle them so that different CPUs follow different orders.
2073            // Each CPU radiates in both directions based on the cpu id and
2074            // radiates out to the closest cores based on core ids.
2075
2076            sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
2077            node_order = radiate(node_order, cpu.node_id);
2078            llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
2079            core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
2080
2081            // Concatenate and record the topology boundaries.
2082            let mut order: Vec<usize> = vec![];
2083            let mut idx: usize = 0;
2084
2085            idx += 1;
2086            order.push(cpu_id);
2087
2088            idx += core_order.len();
2089            order.append(&mut core_order);
2090            let core_end = idx;
2091
2092            idx += llc_order.len();
2093            order.append(&mut llc_order);
2094            let llc_end = idx;
2095
2096            idx += node_order.len();
2097            order.append(&mut node_order);
2098            let node_end = idx;
2099
2100            idx += sys_order.len();
2101            order.append(&mut sys_order);
2102            let sys_end = idx;
2103
2104            debug!(
2105                "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
2106                cpu_id, core_end, llc_end, node_end, sys_end, &order
2107            );
2108
2109            // Record in cpu_ctx.
2110            let pmap = &mut cpu_ctxs[cpu_id].prox_map;
2111            for (i, &cpu) in order.iter().enumerate() {
2112                pmap.cpus[i] = cpu as u16;
2113            }
2114            pmap.core_end = core_end as u32;
2115            pmap.llc_end = llc_end as u32;
2116            pmap.node_end = node_end as u32;
2117            pmap.sys_end = sys_end as u32;
2118        }
2119    }
2120
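    // convert_cpu_ctxs() below turns each cpu_ctx into its raw byte
    // representation so it can be written back through the per-CPU map update
    // API, which takes one byte buffer per CPU. The unsafe cast relies on
    // bpf_intf::cpu_ctx being a plain bindgen-generated C struct.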
2121    fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
2122        cpu_ctxs
2123            .into_iter()
2124            .map(|cpu_ctx| {
2125                let bytes = unsafe {
2126                    std::slice::from_raw_parts(
2127                        &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
2128                        std::mem::size_of::<bpf_intf::cpu_ctx>(),
2129                    )
2130                };
2131                bytes.to_vec()
2132            })
2133            .collect()
2134    }
2135
2136    fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
2137        let key = (0_u32).to_ne_bytes();
2138        let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
2139        let cpu_ctxs_vec = skel
2140            .maps
2141            .cpu_ctxs
2142            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
2143            .context("Failed to lookup cpu_ctx")?
2144            .unwrap();
2145
2146        let op_layers: Vec<u32> = layer_specs
2147            .iter()
2148            .enumerate()
2149            .filter(|(_idx, spec)| match &spec.kind {
2150                LayerKind::Open { .. } => spec.kind.common().preempt,
2151                _ => false,
2152            })
2153            .map(|(idx, _)| idx as u32)
2154            .collect();
2155        let on_layers: Vec<u32> = layer_specs
2156            .iter()
2157            .enumerate()
2158            .filter(|(_idx, spec)| match &spec.kind {
2159                LayerKind::Open { .. } => !spec.kind.common().preempt,
2160                _ => false,
2161            })
2162            .map(|(idx, _)| idx as u32)
2163            .collect();
2164        let gp_layers: Vec<u32> = layer_specs
2165            .iter()
2166            .enumerate()
2167            .filter(|(_idx, spec)| match &spec.kind {
2168                LayerKind::Grouped { .. } => spec.kind.common().preempt,
2169                _ => false,
2170            })
2171            .map(|(idx, _)| idx as u32)
2172            .collect();
2173        let gn_layers: Vec<u32> = layer_specs
2174            .iter()
2175            .enumerate()
2176            .filter(|(_idx, spec)| match &spec.kind {
2177                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2178                _ => false,
2179            })
2180            .map(|(idx, _)| idx as u32)
2181            .collect();
2182
2183        // FIXME - this incorrectly assumes all possible CPUs are consecutive.
2184        for cpu in 0..*NR_CPUS_POSSIBLE {
2185            cpu_ctxs.push(*unsafe {
2186                &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2187            });
2188
2189            let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2190            let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2191            cpu_ctxs[cpu].cpu = cpu as i32;
2192            cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2193            cpu_ctxs[cpu].is_big = is_big;
2194
2195            fastrand::seed(cpu as u64);
2196
2197            let mut ogp_order = op_layers.clone();
2198            ogp_order.append(&mut gp_layers.clone());
2199            fastrand::shuffle(&mut ogp_order);
2200
2201            let mut ogn_order = on_layers.clone();
2202            ogn_order.append(&mut gn_layers.clone());
2203            fastrand::shuffle(&mut ogn_order);
2204
2205            let mut op_order = op_layers.clone();
2206            fastrand::shuffle(&mut op_order);
2207
2208            let mut on_order = on_layers.clone();
2209            fastrand::shuffle(&mut on_order);
2210
2211            let mut gp_order = gp_layers.clone();
2212            fastrand::shuffle(&mut gp_order);
2213
2214            let mut gn_order = gn_layers.clone();
2215            fastrand::shuffle(&mut gn_order);
2216
2217            for i in 0..MAX_LAYERS {
2218                cpu_ctxs[cpu].ogp_layer_order[i] =
2219                    ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2220                cpu_ctxs[cpu].ogn_layer_order[i] =
2221                    ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2222
2223                cpu_ctxs[cpu].op_layer_order[i] =
2224                    op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2225                cpu_ctxs[cpu].on_layer_order[i] =
2226                    on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2227                cpu_ctxs[cpu].gp_layer_order[i] =
2228                    gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2229                cpu_ctxs[cpu].gn_layer_order[i] =
2230                    gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2231            }
2232        }
2233
2234        Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2235
2236        skel.maps
2237            .cpu_ctxs
2238            .update_percpu(
2239                &key,
2240                &Self::convert_cpu_ctxs(cpu_ctxs),
2241                libbpf_rs::MapFlags::ANY,
2242            )
2243            .context("Failed to update cpu_ctx")?;
2244
2245        Ok(())
2246    }
2247
2248    fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2249        for (&llc_id, llc) in &topo.all_llcs {
2250            // Collect the orders.
2251            let mut node_order: Vec<usize> =
2252                topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2253            let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2254
2255            // Make the orders exclusive.
2256            sys_order.retain(|id| !node_order.contains(id));
2257            node_order.retain(|&id| id != llc_id);
2258
2259            // Shuffle so that different LLCs follow different orders. See
2260            // init_cpu_prox_map().
2261            fastrand::seed(llc_id as u64);
2262            fastrand::shuffle(&mut sys_order);
2263            fastrand::shuffle(&mut node_order);
2264
2265            // Concatenate and record the node boundary.
2266            let mut order: Vec<usize> = vec![];
2267            let mut idx: usize = 0;
2268
2269            idx += 1;
2270            order.push(llc_id);
2271
2272            idx += node_order.len();
2273            order.append(&mut node_order);
2274            let node_end = idx;
2275
2276            idx += sys_order.len();
2277            order.append(&mut sys_order);
2278            let sys_end = idx;
2279
2280            debug!(
2281                "LLC[{}] proximity map[{}/{}]: {:?}",
2282                llc_id, node_end, sys_end, &order
2283            );
2284
2285            // Record in llc_ctx.
2286            //
2287            // XXX - This would be a lot easier if llc_ctx were in the bss.
2288            // See BpfStats::read().
2289            let key = llc_id as u32;
2290            let llc_id_slice =
2291                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2292            let v = skel
2293                .maps
2294                .llc_data
2295                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2296                .unwrap()
2297                .unwrap();
2298            let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2299
2300            let pmap = &mut llcc.prox_map;
2301            for (i, &llc_id) in order.iter().enumerate() {
2302                pmap.llcs[i] = llc_id as u16;
2303            }
2304            pmap.node_end = node_end as u32;
2305            pmap.sys_end = sys_end as u32;
2306
2307            let v = unsafe {
2308                std::slice::from_raw_parts(
2309                    &llcc as *const bpf_intf::llc_ctx as *const u8,
2310                    std::mem::size_of::<bpf_intf::llc_ctx>(),
2311                )
2312            };
2313
2314            skel.maps
2315                .llc_data
2316                .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2317        }
2318
2319        Ok(())
2320    }
2321
2322    fn init(
2323        opts: &'a Opts,
2324        layer_specs: &[LayerSpec],
2325        open_object: &'a mut MaybeUninit<OpenObject>,
2326        hint_to_layer_map: &HashMap<u64, HintLayerInfo>,
2327        membw_tracking: bool,
2328    ) -> Result<Self> {
2329        let nr_layers = layer_specs.len();
2330        let mut disable_topology = opts.disable_topology.unwrap_or(false);
2331
2332        let topo = Arc::new(if disable_topology {
2333            Topology::with_flattened_llc_node()?
2334        } else if opts.topology.virt_llc.is_some() {
2335            Topology::with_args(&opts.topology)?
2336        } else {
2337            Topology::new()?
2338        });
2339
2340        /*
2341         * FIXME: scx_layered incorrectly assumes that node, LLC and CPU IDs
2342         * are consecutive. Verify that they are on this system and bail if
2343         * not. It's lucky that core ID is not used anywhere as core IDs are
2344         * not consecutive on some Ryzen CPUs.
2345         */
2346        if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2347            bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2348        }
2349        if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2350            bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2351        }
2352        if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2353            bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2354        }
2355
2356        let netdevs = if opts.netdev_irq_balance {
2357            warn!(
2358                "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2359            );
2360            read_netdevs()?
2361        } else {
2362            BTreeMap::new()
2363        };
2364
2365        if !disable_topology {
2366            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2367                disable_topology = true;
2368            };
2369            info!(
2370                "Topology awareness not specified, selecting {} based on hardware",
2371                if disable_topology {
2372                    "disabled"
2373                } else {
2374                    "enabled"
2375                }
2376            );
2377        };
2378
2379        let cpu_pool = CpuPool::new(topo.clone())?;
2380
2381        // If topology awareness is disabled, clear out any NUMA/LLC configs so
2382        // that the layers fall back to using all cores.
2383        let layer_specs: Vec<_> = if disable_topology {
2384            info!("Disabling topology awareness");
2385            layer_specs
2386                .iter()
2387                .cloned()
2388                .map(|mut s| {
2389                    s.kind.common_mut().nodes.clear();
2390                    s.kind.common_mut().llcs.clear();
2391                    s
2392                })
2393                .collect()
2394        } else {
2395            layer_specs.to_vec()
2396        };
2397
2398        // Check kernel features
2399        init_libbpf_logging(None);
2400        let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2401        if !kfuncs_in_syscall {
2402            warn!("Using slow path: kfuncs not supported in syscall programs (commit a8e03b6bbb2c not in kernel)");
2403        }
2404
2405        // Open the BPF prog first for verification.
2406        let mut skel_builder = BpfSkelBuilder::default();
2407        skel_builder.obj_builder.debug(opts.verbose > 1);
2408        info!(
2409            "Running scx_layered (build ID: {})",
2410            build_id::full_version(env!("CARGO_PKG_VERSION"))
2411        );
2412        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2413        let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2414
2415        // No memory BW tracking by default
2416        skel.progs.scx_pmu_switch_tc.set_autoload(membw_tracking);
2417        skel.progs.scx_pmu_tick_tc.set_autoload(membw_tracking);
2418
2419        // Enable autoload for conditionally loaded programs immediately after
2420        // creating the skeleton (this always runs before the skeleton is loaded).
2421        if opts.enable_gpu_support {
2422            // By default, kprobe nvidia_open whenever GPU support is enabled;
2423            // it has been observed to be relatively cheap to kprobe.
2424            if opts.gpu_kprobe_level >= 1 {
2425                compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2426            }
2427            // enable the rest progressively based upon how often they are called
2428            // for observed workloads
2429            if opts.gpu_kprobe_level >= 2 {
2430                compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2431            }
2432            if opts.gpu_kprobe_level >= 3 {
2433                compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2434            }
2435        }
2436
2437        let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2438        let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2439
2440        let event = if membw_tracking {
2441            setup_membw_tracking(&mut skel)?
2442        } else {
2443            0
2444        };
2445
2446        let rodata = skel.maps.rodata_data.as_mut().unwrap();
2447
2448        if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
2449            rodata.ext_sched_class_addr = ext_sched_class_addr.unwrap();
2450            rodata.idle_sched_class_addr = idle_sched_class_addr.unwrap();
2451        } else {
2452            warn!(
2453                "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2454            );
2455        }
2456
2457        rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2458        rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2459
2460        // Initialize skel according to @opts.
2461        skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2462
2463        if !opts.disable_queued_wakeup {
2464            match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2465                0 => info!("Kernel does not support queued wakeup optimization"),
2466                v => skel.struct_ops.layered_mut().flags |= v,
2467            }
2468        }
2469
2470        rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2471        rodata.percpu_kthread_preempt_all =
2472            !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2473        rodata.debug = opts.verbose as u32;
2474        rodata.slice_ns = opts.slice_us * 1000;
2475        rodata.max_exec_ns = if opts.max_exec_us > 0 {
2476            opts.max_exec_us * 1000
2477        } else {
2478            opts.slice_us * 1000 * 20
2479        };
2480        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2481        rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2482        rodata.smt_enabled = topo.smt_enabled;
2483        rodata.has_little_cores = topo.has_little_cores();
2484        rodata.xnuma_preemption = opts.xnuma_preemption;
2485        rodata.antistall_sec = opts.antistall_sec;
2486        rodata.monitor_disable = opts.monitor_disable;
2487        rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2488        rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2489        rodata.enable_antistall = !opts.disable_antistall;
2490        rodata.enable_match_debug = opts.enable_match_debug;
2491        rodata.enable_gpu_support = opts.enable_gpu_support;
2492        rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2493
2494        for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2495            rodata.__sibling_cpu[cpu] = *sib;
2496        }
2497        for cpu in topo.all_cpus.keys() {
2498            rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2499        }
2500
2501        rodata.nr_op_layers = layer_specs
2502            .iter()
2503            .filter(|spec| match &spec.kind {
2504                LayerKind::Open { .. } => spec.kind.common().preempt,
2505                _ => false,
2506            })
2507            .count() as u32;
2508        rodata.nr_on_layers = layer_specs
2509            .iter()
2510            .filter(|spec| match &spec.kind {
2511                LayerKind::Open { .. } => !spec.kind.common().preempt,
2512                _ => false,
2513            })
2514            .count() as u32;
2515        rodata.nr_gp_layers = layer_specs
2516            .iter()
2517            .filter(|spec| match &spec.kind {
2518                LayerKind::Grouped { .. } => spec.kind.common().preempt,
2519                _ => false,
2520            })
2521            .count() as u32;
2522        rodata.nr_gn_layers = layer_specs
2523            .iter()
2524            .filter(|spec| match &spec.kind {
2525                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2526                _ => false,
2527            })
2528            .count() as u32;
2529        rodata.nr_excl_layers = layer_specs
2530            .iter()
2531            .filter(|spec| spec.kind.common().exclusive)
2532            .count() as u32;
2533
2534        let mut min_open = u64::MAX;
2535        let mut min_preempt = u64::MAX;
2536
2537        for spec in layer_specs.iter() {
2538            if let LayerKind::Open { common, .. } = &spec.kind {
2539                min_open = min_open.min(common.disallow_open_after_us.unwrap());
2540                min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2541            }
2542        }
2543
2544        rodata.min_open_layer_disallow_open_after_ns = match min_open {
2545            u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2546            v => v,
2547        };
2548        rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2549            u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2550            v => v,
2551        };
2552
2553        // Consider all layers empty at the beginning.
2554        for i in 0..layer_specs.len() {
2555            skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2556        }
2557        skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2558
2559        // We set the pin path before loading the skeleton. This will ensure
2560        // libbpf creates and pins the map, or reuses the pinned map fd for us,
2561        // so that we can keep reusing the older map already pinned on scheduler
2562        // restarts.
2563        let layered_task_hint_map_path = &opts.task_hint_map;
2564        let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2565        // Only set pin path if a path is provided.
2566        if !layered_task_hint_map_path.is_empty() {
2567            hint_map.set_pin_path(layered_task_hint_map_path).unwrap();
2568            rodata.task_hint_map_enabled = true;
2569        }
2570
2571        if !opts.hi_fb_thread_name.is_empty() {
2572            let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2573            copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2574            rodata.enable_hi_fb_thread_name_match = true;
2575        }
2576
2577        let cgroup_regexes = Self::init_layers(&mut skel, &layer_specs, &topo)?;
2578        skel.maps.rodata_data.as_mut().unwrap().nr_cgroup_regexes = cgroup_regexes.len() as u32;
2579        Self::init_nodes(&mut skel, opts, &topo);
2580
2581        let mut skel = scx_ops_load!(skel, layered, uei)?;
2582
2583        // Populate the mapping of hints to layer IDs for faster lookups
2584        if !hint_to_layer_map.is_empty() {
2585            for (k, v) in hint_to_layer_map.iter() {
2586                let key: u32 = *k as u32;
2587
2588                // Create hint_layer_info struct
2589                let mut info_bytes = vec![0u8; std::mem::size_of::<bpf_intf::hint_layer_info>()];
2590                let info_ptr = info_bytes.as_mut_ptr() as *mut bpf_intf::hint_layer_info;
2591                unsafe {
2592                    (*info_ptr).layer_id = v.layer_id as u32;
2593                    (*info_ptr).system_cpu_util_below = match v.system_cpu_util_below {
2594                        Some(threshold) => (threshold * 10000.0) as u64,
2595                        None => u64::MAX, // disabled sentinel
2596                    };
2597                    (*info_ptr).dsq_insert_below = match v.dsq_insert_below {
2598                        Some(threshold) => (threshold * 10000.0) as u64,
2599                        None => u64::MAX, // disabled sentinel
2600                    };
2601                }
2602
2603                skel.maps.hint_to_layer_id_map.update(
2604                    &key.to_ne_bytes(),
2605                    &info_bytes,
2606                    libbpf_rs::MapFlags::ANY,
2607                )?;
2608            }
2609        }
2610
2611        if membw_tracking {
2612            create_perf_fds(&mut skel, event)?;
2613        }
2614
2615        let mut layers = vec![];
2616        let layer_growth_orders =
2617            LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2618        for (idx, spec) in layer_specs.iter().enumerate() {
2619            let growth_order = layer_growth_orders
2620                .get(&idx)
2621                .with_context(|| "layer has no growth order".to_string())?;
2622            layers.push(Layer::new(spec, &topo, growth_order)?);
2623        }
2624
2625        let mut idle_qos_enabled = layers
2626            .iter()
2627            .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2628        if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2629            warn!("idle_resume_us not supported, ignoring");
2630            idle_qos_enabled = false;
2631        }
2632
2633        Self::init_cpus(&skel, &layer_specs, &topo)?;
2634        Self::init_llc_prox_map(&mut skel, &topo)?;
2635
2636        // Other stuff.
2637        let proc_reader = fb_procfs::ProcReader::new();
2638
2639        // Handle setup if layered is running in a pid namespace.
2640        let input = ProgramInput {
2641            ..Default::default()
2642        };
2643        let prog = &mut skel.progs.initialize_pid_namespace;
2644
2645        let _ = prog.test_run(input);
2646
2647        // XXX If we try to refresh the cpumasks here before attaching, we
2648        // sometimes (non-deterministically) don't see the updated values in
2649        // BPF. It would be better to update the cpumasks here before we
2650        // attach, but the value will quickly converge anyways so it's not a
2651        // huge problem in the interim until we figure it out.
2652
2653        // Allow all tasks to open and write to BPF task hint map, now that
2654        // we should have it pinned at the desired location.
2655        if !layered_task_hint_map_path.is_empty() {
2656            let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2657            let mode: libc::mode_t = 0o666;
2658            unsafe {
2659                if libc::chmod(path.as_ptr(), mode) != 0 {
2660                    trace!("'chmod' to 666 of task hint map failed, continuing...");
2661                }
2662            }
2663        }
2664
2665        // Attach.
2666        let struct_ops = scx_ops_attach!(skel, layered)?;
2667        let stats_server = StatsServer::new(stats::server_data()).launch()?;
2668        let mut gpu_task_handler =
2669            GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2670        gpu_task_handler.init(topo.clone());
2671
2672        let sched = Self {
2673            struct_ops: Some(struct_ops),
2674            layer_specs,
2675
2676            sched_intv: Duration::from_secs_f64(opts.interval),
2677            layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2678
2679            cpu_pool,
2680            layers,
2681            idle_qos_enabled,
2682
2683            sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2684
2685            cgroup_regexes: Some(cgroup_regexes),
2686            nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2687            processing_dur: Default::default(),
2688
2689            proc_reader,
2690            skel,
2691
2692            topo,
2693            netdevs,
2694            stats_server,
2695            gpu_task_handler,
2696        };
2697
2698        info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2699
2700        Ok(sched)
2701    }
2702
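    // Serialize a Cpumask into the byte-array bitmap shared with BPF: bit
    // (cpu % 8) of byte (cpu / 8) is set when the CPU is in the mask and
    // cleared otherwise, so no stale bits survive from a previous mask.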
2703    fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2704        for cpu in 0..mask.len() {
2705            if mask.test_cpu(cpu) {
2706                bpfmask[cpu / 8] |= 1 << (cpu % 8);
2707            } else {
2708                bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2709            }
2710        }
2711    }
2712
2713    fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2714        trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2715        Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2716
2717        bpf_layer.nr_cpus = layer.nr_cpus as u32;
2718        for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2719            bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2720        }
2721
2722        bpf_layer.refresh_cpus = 1;
2723    }
2724
2725    fn update_netdev_cpumasks(&mut self) -> Result<()> {
2726        let available_cpus = self.cpu_pool.available_cpus();
2727        if available_cpus.is_empty() {
2728            return Ok(());
2729        }
2730
2731        for (iface, netdev) in self.netdevs.iter_mut() {
2732            let node = self
2733                .topo
2734                .nodes
2735                .values()
2736                .find(|n| n.id == netdev.node())
2738                .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2739            let node_cpus = node.span.clone();
2740            for (irq, irqmask) in netdev.irqs.iter_mut() {
2741                irqmask.clear_all();
2742                for cpu in available_cpus.iter() {
2743                    if !node_cpus.test_cpu(cpu) {
2744                        continue;
2745                    }
2746                    let _ = irqmask.set_cpu(cpu);
2747                }
2748                // If no CPUs are available in the node then spread the load across the node
2749                if irqmask.weight() == 0 {
2750                    for cpu in node_cpus.iter() {
2751                        let _ = irqmask.set_cpu(cpu);
2752                    }
2753                }
2754                trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2755            }
2756            netdev.apply_cpumasks()?;
2757        }
2758
2759        Ok(())
2760    }
2761
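    // Derive a CPU target that respects the layer's memory bandwidth limit by
    // estimating per-CPU bandwidth from the last measurement. Illustrative
    // numbers (not from the source): 12 GiB measured across 8 CPUs is 1.5 GiB
    // per CPU, so a 6 GiB limit maps to a target of 4 CPUs.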
2762    fn clamp_target_by_membw(
2763        &self,
2764        layer: &Layer,
2765        membw_limit: f64,
2766        membw: f64,
2767        curtarget: u64,
2768    ) -> usize {
2769        let ncpu: u64 = layer.cpus.weight() as u64;
2770        let membw = (membw * 1024.0_f64.powi(3)).round() as u64;
2771        let membw_limit = (membw_limit * 1024.0_f64.powi(3)).round() as u64;
2772        let last_membw_percpu = if ncpu > 0 { membw / ncpu } else { 0 };
2773
2774        // Either there is no memory bandwidth limit set, or the counters
2775        // are not fully initialized yet. Just return the current target.
2776        if membw_limit == 0 || last_membw_percpu == 0 {
2777            return curtarget as usize;
2778        }
2779
2780        (membw_limit / last_membw_percpu) as usize
2781    }
2782
2783    /// Calculate how many CPUs each layer would like to have if there were
2784    /// no competition. The CPU range is determined by applying the inverse
2785    /// of util_range and then capping by cpus_range. If the current
2786    /// allocation is within the acceptable range, no change is made.
2787    /// Returns (target, min) pair for each layer.
2788    fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2789        let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2790        let utils = &self.sched_stats.layer_utils;
2791        let membws = &self.sched_stats.layer_membws;
2792
2793        let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2794        let mut targets: Vec<(usize, usize)> = vec![];
2795
2796        for (idx, layer) in self.layers.iter().enumerate() {
2797            targets.push(match &layer.kind {
2798                LayerKind::Confined {
2799                    util_range,
2800                    cpus_range,
2801                    cpus_range_frac,
2802                    membw_gb,
2803                    ..
2804                }
2805                | LayerKind::Grouped {
2806                    util_range,
2807                    cpus_range,
2808                    cpus_range_frac,
2809                    membw_gb,
2810                    ..
2811                } => {
2812                    let cpus_range =
2813                        resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2814
2815                    // A grouped layer can choose to include open cputime
2816                    // for sizing. Also, as an empty layer can only get CPU
2817                    // time through fallback (counted as owned) or open
2818                    // execution, add open cputime for empty layers.
2819                    let owned = utils[idx][LAYER_USAGE_OWNED];
2820                    let open = utils[idx][LAYER_USAGE_OPEN];
2821
2822                    let membw_owned = membws[idx][LAYER_USAGE_OWNED];
2823                    let membw_open = membws[idx][LAYER_USAGE_OPEN];
2824
2825                    let mut util = owned;
2826                    let mut membw = membw_owned;
2827                    if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2828                        util += open;
2829                        membw += membw_open;
2830                    }
2831
2832                    let util = if util < 0.01 { 0.0 } else { util };
2833                    let low = (util / util_range.1).ceil() as usize;
2834                    let high = ((util / util_range.0).floor() as usize).max(low);
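                    // E.g. (illustrative): util = 2.4 with util_range = (0.5, 0.8)
                    // gives low = ceil(2.4 / 0.8) = 3 and high = floor(2.4 / 0.5) = 4,
                    // i.e. 3-4 CPUs keep per-CPU utilization inside the range.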
2835
2836                    let membw_limit = match membw_gb {
2837                        Some(membw_limit) => *membw_limit,
2838                        None => 0.0,
2839                    };
2840
2841                    trace!(
2842                        "layer {0} (membw, membw_limit): ({membw} GiB, {membw_limit} GiB)",
2843                        layer.name
2844                    );
2845
2846                    let target = layer.cpus.weight().clamp(low, high);
2847
2848                    records.push((
2849                        (owned * 100.0) as u64,
2850                        (open * 100.0) as u64,
2851                        (util * 100.0) as u64,
2852                        low,
2853                        high,
2854                        target,
2855                    ));
2856
2857                    let target = target.clamp(cpus_range.0, cpus_range.1);
2858                    let membw_target = self.clamp_target_by_membw(
2859                        &layer,
2860                        membw_limit as f64,
2861                        membw as f64,
2862                        target as u64,
2863                    );
2864
2865                    trace!("CPU target pre- and post-membw adjustment: {target} -> {membw_target}");
2866
2867                    // If there's no way to drop our memory usage down enough,
2868                    // pin the target CPUs to low and drop a warning.
2869                    if membw_target < cpus_range.0 {
2870                        warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2871                        warn!("membw_target {membw_target} low {}", cpus_range.0);
2872                    }
2873
2874                    // The memory bandwidth target may only lower the CPU count; the
2875                    // clamp below keeps it within [cpus_range.0, utilization-based target].
2876                    let target = membw_target.clamp(cpus_range.0, target);
2877
2878                    (target, cpus_range.0)
2879                }
2880                LayerKind::Open { .. } => (0, 0),
2881            });
2882        }
2883
2884        trace!("(owned, open, util, low, high, target): {:?}", &records);
2885        targets
2886    }
2887
2888    /// Given (target, min) pair for each layer which was determined
2889    /// assuming infinite number of CPUs, distribute the actual CPUs
2890    /// according to their weights.
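    /// An illustrative run (not from the source): with 16 CPUs, equal weights,
    /// and (target, min) inputs of (10, 2), (8, 2) and (1, 1), the result is
    /// 7, 8 and 1 CPUs. The last layer is accepted at its minimum, the second
    /// fits under its weighted share, and the first gets whatever is left.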
2891    fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2892        let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2893        let weights: Vec<usize> = self
2894            .layers
2895            .iter()
2896            .map(|layer| layer.kind.common().weight as usize)
2897            .collect();
2898        let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2899            .iter()
2900            .zip(&weights)
2901            .enumerate()
2902            .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2903            .collect();
2904        let mut weight_sum: usize = weights.iter().sum();
2905        let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2906
2907        trace!("cands: {:?}", &cands);
2908
2909        // First, accept all layers that are <= min.
2910        cands.retain(|&i, &mut (target, min, weight)| {
2911            if target <= min {
2912                let target = target.min(nr_left);
2913                weighted[i] = target;
2914                weight_sum -= weight;
2915                nr_left -= target;
2916                false
2917            } else {
2918                true
2919            }
2920        });
2921
2922        trace!("cands after accepting mins: {:?}", &cands);
2923
2924        // Keep accepting ones under their allotted share.
2925        let calc_share = |nr_left, weight, weight_sum| {
2926            (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2927        };
2928
2929        while !cands.is_empty() {
2930            let mut progress = false;
2931
2932            cands.retain(|&i, &mut (target, _min, weight)| {
2933                let share = calc_share(nr_left, weight, weight_sum);
2934                if target <= share {
2935                    weighted[i] = target;
2936                    weight_sum -= weight;
2937                    nr_left -= target;
2938                    progress = true;
2939                    false
2940                } else {
2941                    true
2942                }
2943            });
2944
2945            if !progress {
2946                break;
2947            }
2948        }
2949
2950        trace!("cands after accepting under allotted: {:?}", &cands);
2951
2952        // The remaining candidates are in contention with each other,
2953        // distribute according to the shares.
2954        let nr_to_share = nr_left;
2955        for (i, (_target, _min, weight)) in cands.into_iter() {
2956            let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2957            weighted[i] = share;
2958            nr_left -= share;
2959        }
2960
2961        trace!("weighted: {:?}", &weighted);
2962
2963        weighted
2964    }
2965
2966    // Figure out a (full_llcs, extra_cores) tuple for the target CPU count
2967    // computed by weighted_target_nr_cpus: the number of full LLCs a layer
2968    // occupies plus the extra cores needed for CPUs that don't fill a full LLC.
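    // E.g. (illustrative): with 4 cores per LLC and 2 CPUs per core, a target
    // of 21 CPUs maps to (2 full LLCs, 3 extra cores).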
2969    fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2970        // TODO(kkd): We assume each LLC has an equal number of cores.
2971        let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2972        // TODO(kkd): We assume each core has a fixed number of threads.
2973        let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2974        let cpus_per_llc = cores_per_llc * cpus_per_core;
2975
2976        let full = target / cpus_per_llc;
2977        let extra = target % cpus_per_llc;
2978
2979        (full, extra.div_ceil(cpus_per_core))
2980    }
2981
2982    // Recalculate the core order for layers using the StickyDynamic growth
2983    // algorithm. Tuples from compute_target_llcs decide how many LLCs and
2984    // cores should be assigned to each layer; the logic that allocates and
2985    // frees CPUs operates on that core order. This happens in three logical
2986    // steps: first free LLCs from layers that shrank since the last
2987    // recomputation, then distribute the freed LLCs to growing layers, and
2988    // finally spill the remaining extra cores into free LLCs.
2989    fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
2990        // Collect freed LLCs from shrinking layers.
2991        debug!(
2992            " free: before pass: free_llcs={:?}",
2993            self.cpu_pool.free_llcs
2994        );
2995        for &(idx, target) in layer_targets.iter().rev() {
2996            let layer = &mut self.layers[idx];
2997            let old_tlc = layer.target_llc_cpus;
2998            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2999
3000            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3001                continue;
3002            }
3003
3004            let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
3005
3006            debug!(
3007                " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
3008                layer.name,
3009                old_tlc,
3010                new_tlc,
3011                to_free,
3012                layer.assigned_llcs.len(),
3013                self.cpu_pool.free_llcs.len()
3014            );
3015
3016            while to_free > 0 && !layer.assigned_llcs.is_empty() {
3017                let llc = layer.assigned_llcs.pop().unwrap();
3018                self.cpu_pool.free_llcs.push((llc, 0));
3019                to_free -= 1;
3020
3021                debug!(" layer={} freed_llc={}", layer.name, llc);
3022            }
3023        }
3024        debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
3025
3026        // Redistribute the freed LLCs to growing layers.
3027        for &(idx, target) in layer_targets.iter().rev() {
3028            let layer = &mut self.layers[idx];
3029            let old_tlc = layer.target_llc_cpus;
3030            let new_tlc = Self::compute_target_llcs(target, &self.topo);
3031
3032            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3033                continue;
3034            }
3035
3036            let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
3037
3038            debug!(
3039                " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
3040                layer.name,
3041                old_tlc,
3042                new_tlc,
3043                to_alloc,
3044                layer.assigned_llcs.len(),
3045                self.cpu_pool.free_llcs.len()
3046            );
3047
3048            while to_alloc > 0 && to_alloc <= self.cpu_pool.free_llcs.len() {
3052                let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
3053                layer.assigned_llcs.push(llc);
3054                to_alloc -= 1;
3055
3056                debug!(" layer={} alloc_llc={}", layer.name, llc);
3057            }
3058
3059            debug!(
3060                " alloc: layer={} assigned_llcs={:?}",
3061                layer.name, layer.assigned_llcs
3062            );
3063
3064            // Update for next iteration.
3065            layer.target_llc_cpus = new_tlc;
3066        }
3067
3068        // Spill over the remaining extra cores into free LLCs. Bigger layers get
3069        // to take a chunk before smaller layers.
3070        for &(idx, _) in layer_targets.iter() {
3071            let mut core_order = vec![];
3072            let layer = &mut self.layers[idx];
3073
3074            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3075                continue;
3076            }
3077
3078            let tlc = layer.target_llc_cpus;
3079            let mut extra = tlc.1;
3080            // TODO(kkd): Move this logic into cpu_pool? What's the best place?
3081            let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
3082            let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
3083            let cpus_per_llc = cores_per_llc * cpus_per_core;
3084
3085            // Consume from front since we pop from the back.
3086            for i in 0..self.cpu_pool.free_llcs.len() {
3087                let free_vec = &mut self.cpu_pool.free_llcs;
3088                // Available CPUs in LLC.
3089                let avail = cpus_per_llc - free_vec[i].1;
3090                // The amount we'll use.
3091                let used = extra.min(avail);
3092
3093                let shift = free_vec[i].1;
3094                free_vec[i].1 += used;
3095
3096                let llc_id = free_vec[i].0;
3097                let llc = self.topo.all_llcs.get(&llc_id).unwrap();
3098
3099                // Take `used` cores from this LLC, starting after the cores
3100                // already handed out from it earlier in this pass.
3101                for core in llc.cores.iter().skip(shift).take(used) {
3102                    core_order.push(core.1.id);
3105                }
3106
3107                extra -= used;
3108                if extra == 0 {
3109                    break;
3110                }
3111            }
3112
3113            core_order.reverse();
3114            layer.core_order = core_order;
3115        }
3116
3117        // Reset consumed entries in free LLCs.
3118        for i in 0..self.cpu_pool.free_llcs.len() {
3119            self.cpu_pool.free_llcs[i].1 = 0;
3120        }
3121
3122        for &(idx, _) in layer_targets.iter() {
3123            let layer = &mut self.layers[idx];
3124
3125            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3126                continue;
3127            }
3128
3129            for core in self.topo.all_cores.iter() {
3130                let llc_id = core.1.llc_id;
3131                if layer.assigned_llcs.contains(&llc_id) {
3132                    layer.core_order.push(core.1.id);
3133                }
3134            }
3135            // Update core_order for the layer, but reverse to keep the start stable.
3136            layer.core_order.reverse();
3137
3138            debug!(
3139                " alloc: layer={} core_order={:?}",
3140                layer.name, layer.core_order
3141            );
3142        }
3143    }
3144
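    // Recompute per-layer CPU targets and apply them: shrink confined/grouped
    // layers above target first so CPUs become available, then grow layers
    // below target, hand the leftovers to open layers, and finally push the
    // updated cpumasks and empty-layer list to the BPF side.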
3145    fn refresh_cpumasks(&mut self) -> Result<()> {
3146        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
3147
3148        let mut updated = false;
3149        let targets = self.calc_target_nr_cpus();
3150        let targets = self.weighted_target_nr_cpus(&targets);
3151
3152        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
3153        ascending.sort_by(|a, b| a.1.cmp(&b.1));
3154
3155        self.recompute_layer_core_order(&ascending);
3156
3157        // If any layer is growing, guarantee that the largest layer that is
3158        // freeing CPUs frees at least one CPU.
3159        let mut force_free = self
3160            .layers
3161            .iter()
3162            .zip(targets.iter())
3163            .any(|(layer, &target)| layer.nr_cpus < target);
3164
3165        // Shrink all layers first so that CPUs are available for
3166        // redistribution. Do so in the descending target number of CPUs
3167        // order.
3168        for &(idx, target) in ascending.iter().rev() {
3169            let layer = &mut self.layers[idx];
3170            if layer_is_open(layer) {
3171                continue;
3172            }
3173
3174            let nr_cur = layer.cpus.weight();
3175            if nr_cur <= target {
3176                continue;
3177            }
3178            let mut nr_to_free = nr_cur - target;
3179
3180            // There's some dampening built into the util metrics, but slow down
3181            // freeing further to avoid unnecessary changes. This is solely
3182            // based on intuition. Drop or update according to real-world
3183            // behavior.
3184            let nr_to_break_at = nr_to_free / 2;
3185
3186            let mut freed = false;
3187
3188            while nr_to_free > 0 {
3189                let max_to_free = if force_free {
3190                    force_free = false;
3191                    layer.nr_cpus
3192                } else {
3193                    nr_to_free
3194                };
3195
3196                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
3197                if nr_freed == 0 {
3198                    break;
3199                }
3200
3201                nr_to_free = nr_to_free.saturating_sub(nr_freed);
3202                freed = true;
3203
3204                if nr_to_free <= nr_to_break_at {
3205                    break;
3206                }
3207            }
3208
3209            if freed {
3210                Self::update_bpf_layer_cpumask(
3211                    layer,
3212                    &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3213                );
3214                updated = true;
3215            }
3216        }
3217
3218        // Grow layers. Do so in the ascending target number of CPUs order
3219        // so that we're always more generous to smaller layers. This avoids
3220        // starving small layers and shouldn't make a noticeable difference for
3221        // bigger layers as work conservation should still be achieved
3222        // through open execution.
3223        for &(idx, target) in &ascending {
3224            let layer = &mut self.layers[idx];
3225
3226            if layer_is_open(layer) {
3227                continue;
3228            }
3229
3230            let nr_cur = layer.cpus.weight();
3231            if nr_cur >= target {
3232                continue;
3233            }
3234
3235            let mut nr_to_alloc = target - nr_cur;
3236            let mut alloced = false;
3237
3238            while nr_to_alloc > 0 {
3239                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3240                if nr_alloced == 0 {
3241                    break;
3242                }
3243                alloced = true;
3244                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3245            }
3246
3247            if alloced {
3248                Self::update_bpf_layer_cpumask(
3249                    layer,
3250                    &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3251                );
3252                updated = true;
3253            }
3254        }
3255
3256        // Give the rest to the open layers.
3257        if updated {
3258            for (idx, layer) in self.layers.iter_mut().enumerate() {
3259                if !layer_is_open(layer) {
3260                    continue;
3261                }
3262
3263                let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3264                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3265                let nr_available_cpus = available_cpus.weight();
3266
3267                // Open layers need the intersection of allowed cpus and
3268                // available cpus.
3269                layer.cpus = available_cpus;
3270                layer.nr_cpus = nr_available_cpus;
3271                Self::update_bpf_layer_cpumask(layer, bpf_layer);
3272            }
3273
3274            self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3275                self.cpu_pool.fallback_cpu as u32;
3276
3277            for (lidx, layer) in self.layers.iter().enumerate() {
3278                self.nr_layer_cpus_ranges[lidx] = (
3279                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3280                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3281                );
3282            }
3283
3284            // Trigger updates on the BPF side.
3285            let input = ProgramInput {
3286                ..Default::default()
3287            };
3288            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3289            let _ = prog.test_run(input);
3290
3291            // Update empty_layers.
3292            let empty_layer_ids: Vec<u32> = self
3293                .layers
3294                .iter()
3295                .enumerate()
3296                .filter(|(_idx, layer)| layer.nr_cpus == 0)
3297                .map(|(idx, _layer)| idx as u32)
3298                .collect();
3299            for i in 0..self.layers.len() {
3300                self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3301                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3302            }
3303            self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3304                empty_layer_ids.len() as u32;
3305        }
3306
3307        let _ = self.update_netdev_cpumasks();
3308        Ok(())
3309    }
3310
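    // Propagate each layer's idle_resume_us setting to the CPUs it currently
    // owns; CPUs not claimed by any layer fall back to 0 (no QoS override).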
3311    fn refresh_idle_qos(&mut self) -> Result<()> {
3312        if !self.idle_qos_enabled {
3313            return Ok(());
3314        }
3315
3316        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3317        for layer in self.layers.iter() {
3318            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3319            for cpu in layer.cpus.iter() {
3320                cpu_idle_qos[cpu] = idle_resume_us;
3321            }
3322        }
3323
3324        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3325            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3326        }
3327
3328        Ok(())
3329    }
3330
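    // One scheduling tick: refresh utilization stats, publish the EWMA values
    // consumed by the BPF side, then rebalance cpumasks, idle QoS, and GPU
    // task affinity.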
3331    fn step(&mut self) -> Result<()> {
3332        let started_at = Instant::now();
3333        self.sched_stats.refresh(
3334            &mut self.skel,
3335            &self.proc_reader,
3336            started_at,
3337            self.processing_dur,
3338            &self.gpu_task_handler,
3339        )?;
3340
3341        // Update BPF with EWMA values
3342        self.skel
3343            .maps
3344            .bss_data
3345            .as_mut()
3346            .unwrap()
3347            .system_cpu_util_ewma = (self.sched_stats.system_cpu_util_ewma * 10000.0) as u64;
3348
3349        for layer_id in 0..self.sched_stats.nr_layers {
3350            self.skel
3351                .maps
3352                .bss_data
3353                .as_mut()
3354                .unwrap()
3355                .layer_dsq_insert_ewma[layer_id] =
3356                (self.sched_stats.layer_dsq_insert_ewma[layer_id] * 10000.0) as u64;
3357        }
3358
3359        self.refresh_cpumasks()?;
3360        self.refresh_idle_qos()?;
3361        self.gpu_task_handler.maybe_affinitize();
3362        self.processing_dur += Instant::now().duration_since(started_at);
3363        Ok(())
3364    }
3365
3366    fn generate_sys_stats(
3367        &mut self,
3368        stats: &Stats,
3369        cpus_ranges: &mut [(usize, usize)],
3370    ) -> Result<SysStats> {
3371        let bstats = &stats.bpf_stats;
3372        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3373
3374        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3375            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3376            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3377            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3378        }
3379
3380        Ok(sys_stats)
3381    }
3382
3383    // Helper function to process a cgroup creation (common logic for walkdir and inotify)
3384    fn process_cgroup_creation(
3385        path: &Path,
3386        cgroup_regexes: &HashMap<u32, Regex>,
3387        cgroup_path_to_id: &mut HashMap<String, u64>,
3388        sender: &crossbeam::channel::Sender<CgroupEvent>,
3389    ) {
3390        let path_str = path.to_string_lossy().to_string();
3391
3392        // Get cgroup ID (inode number)
3393        let cgroup_id = std::fs::metadata(path)
3394            .map(|metadata| {
3395                use std::os::unix::fs::MetadataExt;
3396                metadata.ino()
3397            })
3398            .unwrap_or(0);
3399
3400        // Build match bitmap by testing against CgroupRegex rules
3401        let mut match_bitmap = 0u64;
3402        for (rule_id, regex) in cgroup_regexes {
3403            if regex.is_match(&path_str) {
3404                match_bitmap |= 1u64 << rule_id;
3405            }
3406        }
3407
3408        // Store in hash
3409        cgroup_path_to_id.insert(path_str.clone(), cgroup_id);
3410
3411        // Send event
3412        if let Err(e) = sender.send(CgroupEvent::Created {
3413            path: path_str,
3414            cgroup_id,
3415            match_bitmap,
3416        }) {
3417            error!("Failed to send cgroup creation event: {}", e);
3418        }
3419    }
3420
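    // Spawn a background thread that mirrors the cgroup hierarchy: existing
    // cgroups are enumerated with WalkDir, new and removed directories are
    // picked up via inotify on /sys/fs/cgroup, and Created/Removed events
    // (with a per-rule match bitmap) are delivered over a bounded channel.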
3421    fn start_cgroup_watcher(
3422        shutdown: Arc<AtomicBool>,
3423        cgroup_regexes: HashMap<u32, Regex>,
3424    ) -> Result<Receiver<CgroupEvent>> {
3425        let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
3426        let mut wd_to_path = HashMap::new();
3427
3428        // Create crossbeam channel for cgroup events (bounded to prevent memory issues)
3429        let (sender, receiver) = crossbeam::channel::bounded::<CgroupEvent>(1024);
3430
3431        // Watch for directory creation and deletion events
3432        let root_wd = inotify
3433            .watches()
3434            .add("/sys/fs/cgroup", WatchMask::CREATE | WatchMask::DELETE)
3435            .context("Failed to add watch for /sys/fs/cgroup")?;
3436        wd_to_path.insert(root_wd, PathBuf::from("/sys/fs/cgroup"));
3437
3438        // Also recursively watch existing directories for new subdirectories
3439        Self::add_recursive_watches(&mut inotify, &mut wd_to_path, Path::new("/sys/fs/cgroup"))?;
3440
3441        // Spawn watcher thread
3442        std::thread::spawn(move || {
3443            let mut buffer = [0; 4096];
3444            let inotify_fd = inotify.as_raw_fd();
3445            // Maintain hash of cgroup path -> cgroup ID (inode number)
3446            let mut cgroup_path_to_id = HashMap::<String, u64>::new();
3447
3448            // Populate existing cgroups
3449            for entry in WalkDir::new("/sys/fs/cgroup")
3450                .into_iter()
3451                .filter_map(|e| e.ok())
3452                .filter(|e| e.file_type().is_dir())
3453            {
3454                let path = entry.path();
3455                Self::process_cgroup_creation(
3456                    path,
3457                    &cgroup_regexes,
3458                    &mut cgroup_path_to_id,
3459                    &sender,
3460                );
3461            }
3462
3463            while !shutdown.load(Ordering::Relaxed) {
3464                // Use select to wait for events with a 100ms timeout
3465                let ready = unsafe {
3466                    let mut read_fds: libc::fd_set = std::mem::zeroed();
3467                    libc::FD_ZERO(&mut read_fds);
3468                    libc::FD_SET(inotify_fd, &mut read_fds);
3469
3470                    let mut timeout = libc::timeval {
3471                        tv_sec: 0,
3472                        tv_usec: 100_000, // 100ms
3473                    };
3474
3475                    libc::select(
3476                        inotify_fd + 1,
3477                        &mut read_fds,
3478                        std::ptr::null_mut(),
3479                        std::ptr::null_mut(),
3480                        &mut timeout,
3481                    )
3482                };
3483
3484                if ready <= 0 {
3485                    // Timeout or error, continue loop to check shutdown
3486                    continue;
3487                }
3488
3489                // Read events non-blocking
3490                let events = match inotify.read_events(&mut buffer) {
3491                    Ok(events) => events,
3492                    Err(e) => {
3493                        error!("Error reading inotify events: {}", e);
3494                        break;
3495                    }
3496                };
3497
3498                for event in events {
3499                    if !event.mask.contains(inotify::EventMask::CREATE)
3500                        && !event.mask.contains(inotify::EventMask::DELETE)
3501                    {
3502                        continue;
3503                    }
3504
3505                    let name = match event.name {
3506                        Some(name) => name,
3507                        None => continue,
3508                    };
3509
3510                    let parent_path = match wd_to_path.get(&event.wd) {
3511                        Some(parent) => parent,
3512                        None => {
3513                            warn!("Unknown watch descriptor: {:?}", event.wd);
3514                            continue;
3515                        }
3516                    };
3517
3518                    let path = parent_path.join(name.to_string_lossy().as_ref());
3519
3520                    if event.mask.contains(inotify::EventMask::CREATE) {
3521                        if !path.is_dir() {
3522                            continue;
3523                        }
3524
3525                        Self::process_cgroup_creation(
3526                            &path,
3527                            &cgroup_regexes,
3528                            &mut cgroup_path_to_id,
3529                            &sender,
3530                        );
3531
3532                        // Add watch for this new directory
3533                        match inotify
3534                            .watches()
3535                            .add(&path, WatchMask::CREATE | WatchMask::DELETE)
3536                        {
3537                            Ok(wd) => {
3538                                wd_to_path.insert(wd, path.clone());
3539                            }
3540                            Err(e) => {
3541                                warn!(
3542                                    "Failed to add watch for new cgroup {}: {}",
3543                                    path.display(),
3544                                    e
3545                                );
3546                            }
3547                        }
3548                    } else if event.mask.contains(inotify::EventMask::DELETE) {
3549                        let path_str = path.to_string_lossy().to_string();
3550
3551                        // Get cgroup ID from our hash (since the directory is gone, we can't stat it)
3552                        let cgroup_id = cgroup_path_to_id.remove(&path_str).unwrap_or(0);
3553
3554                        // Send removal event to main thread
3555                        if let Err(e) = sender.send(CgroupEvent::Removed {
3556                            path: path_str,
3557                            cgroup_id,
3558                        }) {
3559                            error!("Failed to send cgroup removal event: {}", e);
3560                        }
3561
3562                        // Find and remove the watch descriptor for this path
3563                        let wd_to_remove = wd_to_path.iter().find_map(|(wd, watched_path)| {
3564                            if watched_path == &path {
3565                                Some(wd.clone())
3566                            } else {
3567                                None
3568                            }
3569                        });
3570                        if let Some(wd) = wd_to_remove {
3571                            wd_to_path.remove(&wd);
3572                        }
3573                    }
3574                }
3575            }
3576        });
3577
3578        Ok(receiver)
3579    }
3580
3581    fn add_recursive_watches(
3582        inotify: &mut Inotify,
3583        wd_to_path: &mut HashMap<inotify::WatchDescriptor, PathBuf>,
3584        path: &Path,
3585    ) -> Result<()> {
3586        for entry in WalkDir::new(path)
3587            .into_iter()
3588            .filter_map(|e| e.ok())
3589            .filter(|e| e.file_type().is_dir())
3590            .skip(1)
3591        {
3592            let entry_path = entry.path();
3593            // Add watch for this directory
3594            match inotify
3595                .watches()
3596                .add(entry_path, WatchMask::CREATE | WatchMask::DELETE)
3597            {
3598                Ok(wd) => {
3599                    wd_to_path.insert(wd, entry_path.to_path_buf());
3600                }
3601                Err(e) => {
3602                    debug!("Failed to add watch for {}: {}", entry_path.display(), e);
3603                }
3604            }
3605        }
3606        Ok(())
3607    }
3608
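    // Main loop: run scheduling steps on sched_intv, bump the avgruntime
    // refresh sequence when due, and multiplex stats requests, cgroup events,
    // and the next-tick timeout over a crossbeam select.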
3609    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3610        let (res_ch, req_ch) = self.stats_server.channels();
3611        let mut next_sched_at = Instant::now() + self.sched_intv;
3612        let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3613        let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3614        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3615
3616        // Start the cgroup watcher only if there are CgroupRegex rules
3617        let cgroup_regexes = self.cgroup_regexes.take().unwrap();
3618        let cgroup_event_rx = if !cgroup_regexes.is_empty() {
3619            Some(Self::start_cgroup_watcher(
3620                shutdown.clone(),
3621                cgroup_regexes,
3622            )?)
3623        } else {
3624            None
3625        };
3626
3627        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3628            let now = Instant::now();
3629
3630            if now >= next_sched_at {
3631                self.step()?;
3632                while next_sched_at < now {
3633                    next_sched_at += self.sched_intv;
3634                }
3635            }
3636
3637            if enable_layer_refresh && now >= next_layer_refresh_at {
3638                self.skel
3639                    .maps
3640                    .bss_data
3641                    .as_mut()
3642                    .unwrap()
3643                    .layer_refresh_seq_avgruntime += 1;
3644                while next_layer_refresh_at < now {
3645                    next_layer_refresh_at += self.layer_refresh_intv;
3646                }
3647            }
3648
3649            // Handle both stats requests and cgroup events with timeout
3650            let timeout_duration = next_sched_at.saturating_duration_since(Instant::now());
3651            let never_rx = crossbeam::channel::never();
3652            let cgroup_rx = cgroup_event_rx.as_ref().unwrap_or(&never_rx);
3653
3654            select! {
3655                recv(req_ch) -> msg => match msg {
3656                    Ok(StatsReq::Hello(tid)) => {
3657                        cpus_ranges.insert(
3658                            tid,
3659                            self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3660                        );
3661                        let stats =
3662                            Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3663                        res_ch.send(StatsRes::Hello(stats))?;
3664                    }
3665                    Ok(StatsReq::Refresh(tid, mut stats)) => {
3666                        // Propagate self's layer cpu ranges into each stat's.
3667                        for i in 0..self.nr_layer_cpus_ranges.len() {
3668                            for (_, ranges) in cpus_ranges.iter_mut() {
3669                                ranges[i] = (
3670                                    ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3671                                    ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3672                                );
3673                            }
3674                            self.nr_layer_cpus_ranges[i] =
3675                                (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3676                        }
3677
3678                        stats.refresh(
3679                            &mut self.skel,
3680                            &self.proc_reader,
3681                            now,
3682                            self.processing_dur,
3683                            &self.gpu_task_handler,
3684                        )?;
3685                        let sys_stats =
3686                            self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3687                        res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3688                    }
3689                    Ok(StatsReq::Bye(tid)) => {
3690                        cpus_ranges.remove(&tid);
3691                        res_ch.send(StatsRes::Bye)?;
3692                    }
3693                    Err(e) => Err(e)?,
3694                },
3695
3696                recv(cgroup_rx) -> event => match event {
3697                    Ok(CgroupEvent::Created { path, cgroup_id, match_bitmap }) => {
3698                        // Insert into BPF map
3699                        self.skel.maps.cgroup_match_bitmap.update(
3700                            &cgroup_id.to_ne_bytes(),
3701                            &match_bitmap.to_ne_bytes(),
3702                            libbpf_rs::MapFlags::ANY,
3703                        ).with_context(|| format!(
3704                            "Failed to insert cgroup {}({}) into BPF map. Cgroup map may be full \
3705                             (max 16384 entries). Aborting.",
3706                            cgroup_id, path
3707                        ))?;
3708
3709                        debug!("Added cgroup {} to BPF map with bitmap 0x{:x}", cgroup_id, match_bitmap);
3710                    }
3711                    Ok(CgroupEvent::Removed { path, cgroup_id }) => {
3712                        // Delete from BPF map
3713                        if let Err(e) = self.skel.maps.cgroup_match_bitmap.delete(&cgroup_id.to_ne_bytes()) {
3714                            warn!("Failed to delete cgroup {} from BPF map: {}", cgroup_id, e);
3715                        } else {
3716                            debug!("Removed cgroup {}({}) from BPF map", cgroup_id, path);
3717                        }
3718                    }
3719                    Err(e) => {
3720                        error!("Error receiving cgroup event: {}", e);
3721                    }
3722                },
3723
3724                recv(crossbeam::channel::after(timeout_duration)) -> _ => {
3725                    // Timeout - continue main loop
3726                }
3727            }
3728        }
3729
3730        let _ = self.struct_ops.take();
3731        uei_report!(&self.skel, uei)
3732    }
3733}
3734
3735impl Drop for Scheduler<'_> {
3736    fn drop(&mut self) {
3737        info!("Unregister {SCHEDULER_NAME} scheduler");
3738
3739        if let Some(struct_ops) = self.struct_ops.take() {
3740            drop(struct_ops);
3741        }
3742    }
3743}
3744
3745fn write_example_file(path: &str) -> Result<()> {
3746    let mut f = fs::OpenOptions::new()
3747        .create_new(true)
3748        .write(true)
3749        .open(path)?;
3750    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3751}
3752
3753struct HintLayerInfo {
3754    layer_id: usize,
3755    system_cpu_util_below: Option<f64>,
3756    dsq_insert_below: Option<f64>,
3757}
3758
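// Validate the layer specs (match limits, threshold ranges, cpus/util ranges)
// and build the hint-value -> layer mapping, carrying any SystemCpuUtilBelow
// or DsqInsertBelow thresholds that accompany a HintEquals match.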
3759fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, HintLayerInfo>> {
3760    let mut hint_to_layer_map = HashMap::<u64, (usize, String, Option<f64>, Option<f64>)>::new();
3761
3762    let nr_specs = specs.len();
3763    if nr_specs == 0 {
3764        bail!("No layer spec");
3765    }
3766    if nr_specs > MAX_LAYERS {
3767        bail!("Too many layer specs");
3768    }
3769
3770    for (idx, spec) in specs.iter().enumerate() {
3771        if idx < nr_specs - 1 {
3772            if spec.matches.is_empty() {
3773                bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3774            }
3775        } else {
3776            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3777                bail!("Terminal spec {:?} must have an empty match", spec.name);
3778            }
3779        }
3780
3781        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3782            bail!(
3783                "Spec {:?} has too many ({}) OR match blocks",
3784                spec.name,
3785                spec.matches.len()
3786            );
3787        }
3788
3789        for (ands_idx, ands) in spec.matches.iter().enumerate() {
3790            if ands.len() > NR_LAYER_MATCH_KINDS {
3791                bail!(
3792                    "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3793                    spec.name,
3794                    ands_idx,
3795                    ands.len()
3796                );
3797            }
3798            let mut hint_equals_cnt = 0;
3799            let mut system_cpu_util_below_cnt = 0;
3800            let mut dsq_insert_below_cnt = 0;
3801            let mut hint_value: Option<u64> = None;
3802            let mut system_cpu_util_threshold: Option<f64> = None;
3803            let mut dsq_insert_threshold: Option<f64> = None;
3804            for one in ands.iter() {
3805                match one {
3806                    LayerMatch::CgroupPrefix(prefix) => {
3807                        if prefix.len() > MAX_PATH {
3808                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3809                        }
3810                    }
3811                    LayerMatch::CgroupSuffix(suffix) => {
3812                        if suffix.len() > MAX_PATH {
3813                            bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3814                        }
3815                    }
3816                    LayerMatch::CgroupContains(substr) => {
3817                        if substr.len() > MAX_PATH {
3818                            bail!("Spec {:?} has too long a cgroup substr", spec.name);
3819                        }
3820                    }
3821                    LayerMatch::CommPrefix(prefix) => {
3822                        if prefix.len() > MAX_COMM {
3823                            bail!("Spec {:?} has too long a comm prefix", spec.name);
3824                        }
3825                    }
3826                    LayerMatch::PcommPrefix(prefix) => {
3827                        if prefix.len() > MAX_COMM {
3828                            bail!("Spec {:?} has too long a process name prefix", spec.name);
3829                        }
3830                    }
3831                    LayerMatch::SystemCpuUtilBelow(threshold) => {
3832                        if *threshold < 0.0 || *threshold > 1.0 {
3833                            bail!(
3834                                "Spec {:?} has SystemCpuUtilBelow threshold outside the range [0.0, 1.0]",
3835                                spec.name
3836                            );
3837                        }
3838                        system_cpu_util_threshold = Some(*threshold);
3839                        system_cpu_util_below_cnt += 1;
3840                    }
3841                    LayerMatch::DsqInsertBelow(threshold) => {
3842                        if *threshold < 0.0 || *threshold > 1.0 {
3843                            bail!(
3844                                "Spec {:?} has DsqInsertBelow threshold outside the range [0.0, 1.0]",
3845                                spec.name
3846                            );
3847                        }
3848                        dsq_insert_threshold = Some(*threshold);
3849                        dsq_insert_below_cnt += 1;
3850                    }
3851                    LayerMatch::HintEquals(hint) => {
3852                        if *hint > 1024 {
3853                            bail!(
3854                                "Spec {:?} has hint value outside the range [0, 1024]",
3855                                spec.name
3856                            );
3857                        }
3858                        hint_value = Some(*hint);
3859                        hint_equals_cnt += 1;
3860                    }
3861                    _ => {}
3862                }
3863            }
3864            if hint_equals_cnt > 1 {
3865                bail!("Only 1 HintEquals match permitted per AND block");
3866            }
3867            let high_freq_matcher_cnt = system_cpu_util_below_cnt + dsq_insert_below_cnt;
3868            if high_freq_matcher_cnt > 0 {
3869                if hint_equals_cnt != 1 {
3870                    bail!("High-frequency matchers (SystemCpuUtilBelow, DsqInsertBelow) must be used with one HintEquals");
3871                }
3872                if system_cpu_util_below_cnt > 1 {
3873                    bail!("Only 1 SystemCpuUtilBelow match permitted per AND block");
3874                }
3875                if dsq_insert_below_cnt > 1 {
3876                    bail!("Only 1 DsqInsertBelow match permitted per AND block");
3877                }
3878                if ands.len() != hint_equals_cnt + system_cpu_util_below_cnt + dsq_insert_below_cnt
3879                {
3880                    bail!("High-frequency matchers must be used only with HintEquals (no other matchers)");
3881                }
3882            } else if hint_equals_cnt == 1 && ands.len() != 1 {
3883                bail!("HintEquals match cannot be in conjunction with other matches");
3884            }
3885
3886            // Insert hint into map if present
3887            if let Some(hint) = hint_value {
3888                if let Some((layer_id, name, _, _)) = hint_to_layer_map.get(&hint) {
3889                    if *layer_id != idx {
3890                        bail!(
3891                            "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
3892                            spec.name,
3893                            hint,
3894                            name
3895                        );
3896                    }
3897                } else {
3898                    hint_to_layer_map.insert(
3899                        hint,
3900                        (
3901                            idx,
3902                            spec.name.clone(),
3903                            system_cpu_util_threshold,
3904                            dsq_insert_threshold,
3905                        ),
3906                    );
3907                }
3908            }
3909        }
3910
3911        match spec.kind {
3912            LayerKind::Confined {
3913                cpus_range,
3914                util_range,
3915                ..
3916            }
3917            | LayerKind::Grouped {
3918                cpus_range,
3919                util_range,
3920                ..
3921            } => {
3922                if let Some((cpus_min, cpus_max)) = cpus_range {
3923                    if cpus_min > cpus_max {
3924                        bail!(
3925                            "Spec {:?} has invalid cpus_range({}, {})",
3926                            spec.name,
3927                            cpus_min,
3928                            cpus_max
3929                        );
3930                    }
3931                }
3932                if util_range.0 >= util_range.1 {
3933                    bail!(
3934                        "Spec {:?} has invalid util_range ({}, {})",
3935                        spec.name,
3936                        util_range.0,
3937                        util_range.1
3938                    );
3939                }
3940            }
3941            _ => {}
3942        }
3943    }
3944
3945    Ok(hint_to_layer_map
3946        .into_iter()
3947        .map(|(k, v)| {
3948            (
3949                k,
3950                HintLayerInfo {
3951                    layer_id: v.0,
3952                    system_cpu_util_below: v.2,
3953                    dsq_insert_below: v.3,
3954                },
3955            )
3956        })
3957        .collect())
3958}
3959
3960fn name_suffix(cgroup: &str, len: usize) -> String {
3961    let suffixlen = std::cmp::min(len, cgroup.len());
3962    let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
3963
3964    suffixrev.chars().rev().collect()
3965}
3966
3967fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
3968    let mut paths = vec![];
3969
3970    if !dir.is_dir() {
3971        panic!("path {:?} does not correspond to directory", dir);
3972    }
3973
3974    let direntries = fs::read_dir(dir)?;
3975
3976    for entry in direntries {
3977        let path = entry?.path();
3978        if path.is_dir() {
3979            paths.append(&mut traverse_sysfs(&path)?);
3980            paths.push(path);
3981        }
3982    }
3983
3984    Ok(paths)
3985}
3986
3987fn find_cpumask(cgroup: &str) -> Cpumask {
3988    let mut path = String::from(cgroup);
3989    path.push_str("/cpuset.cpus.effective");
3990
3991    let description = fs::read_to_string(&path).unwrap();
3992
3993    Cpumask::from_cpulist(&description).unwrap()
3994}
3995
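// Expand a template match rule into concrete per-cgroup rules: scan
// /sys/fs/cgroup, keep the cgroups matching the suffix or regex, and pair a
// 64-character CgroupSuffix match (usable on the BPF side) with each cgroup's
// effective cpuset.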
3996fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
3997    match rule {
3998        LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3999            .into_iter()
4000            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4001            .filter(|cgroup| cgroup.ends_with(suffix))
4002            .map(|cgroup| {
4003                (
4004                    {
4005                        let mut slashterminated = cgroup.clone();
4006                        slashterminated.push('/');
4007                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4008                    },
4009                    find_cpumask(&cgroup),
4010                )
4011            })
4012            .collect()),
4013        LayerMatch::CgroupRegex(expr) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
4014            .into_iter()
4015            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4016            .filter(|cgroup| {
4017                let re = Regex::new(expr).unwrap();
4018                re.is_match(cgroup)
4019            })
4020            .map(|cgroup| {
4021                (
4022                    // Here we convert the regex match into a suffix match because we still need to
4023                    // do the matching on the bpf side and doing a regex match in bpf isn't
4024                    // easily done.
4025                    {
4026                        let mut slashterminated = cgroup.clone();
4027                        slashterminated.push('/');
4028                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4029                    },
4030                    find_cpumask(&cgroup),
4031                )
4032            })
4033            .collect()),
4034        _ => panic!("Unimplemented template enum {:?}", rule),
4035    }
4036}
4037
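// Open a raw perf counter for `event` on every possible CPU and store the
// resulting fds in the scx_pmu_map BPF map so the BPF side can read them;
// failures on individual CPUs are tolerated and only logged.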
fn create_perf_fds(skel: &mut BpfSkel, event: u64) -> Result<()> {
    // Counting-only raw event: no sampling and enabled from creation.
    let mut attr = perf::bindings::perf_event_attr::default();
    attr.size = std::mem::size_of::<perf::bindings::perf_event_attr>() as u32;
    attr.type_ = perf::bindings::PERF_TYPE_RAW;
    attr.config = event;
    attr.sample_type = 0u64;
    attr.__bindgen_anon_1.sample_period = 0u64;
    attr.set_disabled(0);

    let perf_events_map = &skel.maps.scx_pmu_map;
    let map_fd = unsafe { libbpf_sys::bpf_map__fd(perf_events_map.as_libbpf_object().as_ptr()) };

    let mut failures = 0u64;

    for cpu in 0..*NR_CPUS_POSSIBLE {
        let fd = unsafe { perf::perf_event_open(&mut attr as *mut _, -1, cpu as i32, -1, 0) };
        if fd < 0 {
            failures += 1;
            trace!(
                "perf_event_open failed cpu={cpu} errno={}",
                std::io::Error::last_os_error()
            );
            continue;
        }

        let key = cpu as u32;
        let val = fd as u32;
        let ret = unsafe {
            libbpf_sys::bpf_map_update_elem(
                map_fd,
                &key as *const _ as *const _,
                &val as *const _ as *const _,
                0,
            )
        };
        if ret != 0 {
            trace!("bpf_map_update_elem failed cpu={cpu} fd={fd} ret={ret}");
        } else {
            trace!("mapped cpu={cpu} -> fd={fd}");
        }
    }

    if failures > 0 {
        // Keep going; missing counters degrade membw tracking but should not
        // fail the scheduler.
        warn!("membw tracking: failed to install {failures} counters");
    }

    Ok(())
}

// Pick a memory-bandwidth-related raw PMU event for the detected CPU model
// and make its encoding available to the BPF side.
fn setup_membw_tracking(skel: &mut OpenBpfSkel) -> Result<u64> {
    let pmumanager = PMUManager::new()?;
    let codename: &str = &pmumanager.codename;

    let pmuspec = match codename {
        "amdzen1" | "amdzen2" | "amdzen3" => {
            trace!("found AMD codename {codename}");
            pmumanager.pmus.get("ls_any_fills_from_sys.mem_io_local")
        }
        "amdzen4" | "amdzen5" => {
            trace!("found AMD codename {codename}");
            pmumanager.pmus.get("ls_any_fills_from_sys.dram_io_all")
        }

        "haswell" | "broadwell" | "broadwellde" | "broadwellx" | "skylake" | "skylakex"
        | "cascadelakex" | "arrowlake" | "meteorlake" | "sapphirerapids" | "emeraldrapids"
        | "graniterapids" => {
            trace!("found Intel codename {codename}");
            pmumanager.pmus.get("LONGEST_LAT_CACHE.MISS")
        }

        _ => {
            trace!("found unknown codename {codename}");
            None
        }
    };

    let spec =
        pmuspec.ok_or_else(|| anyhow!("no membw PMU event known for codename {codename}"))?;
    // Raw x86 event encoding: umask in bits 15:8, event select in the low byte.
    let config = (spec.umask << 8) | spec.event[0];

    // Install the event encoding in BPF rodata
    skel.maps.rodata_data.as_mut().unwrap().membw_event = config;

    Ok(config)
}

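// Entry point: parse options, set up logging and signal handling, expand
// template layers into concrete specs, then run the scheduler, reinitializing
// it whenever a run requests a restart.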
fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.version {
        println!(
            "scx_layered {}",
            build_id::full_version(env!("CARGO_PKG_VERSION"))
        );
        return Ok(());
    }

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    if opts.no_load_frac_limit {
        warn!("--no-load-frac-limit is deprecated and a no-op");
    }
    if opts.layer_preempt_weight_disable != 0.0 {
        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
    }
    if opts.layer_growth_weight_disable != 0.0 {
        warn!("--layer-growth-weight-disable is deprecated and a no-op");
    }
    if opts.local_llc_iteration {
        warn!("--local_llc_iteration is deprecated and a no-op");
    }

    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    debug!("opts={:?}", &opts);

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!("stats monitor thread finished with error: {}", error_object)
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    if let Some(path) = &opts.example {
        write_example_file(path)?;
        return Ok(());
    }

    let mut layer_config = if opts.run_example {
        EXAMPLE_CONFIG.clone()
    } else {
        LayerConfig { specs: vec![] }
    };

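    // Parse each layer spec and expand template layers: a template rule that
    // matches existing cgroups generates one concrete layer per match, named
    // after the cgroup and pinned to its effective cpuset; with no matches the
    // template falls back to behaving like a regular layer.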
    for (idx, input) in opts.specs.iter().enumerate() {
        let specs = LayerSpec::parse(input)
            .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;

        for spec in specs {
            match spec.template {
                Some(ref rule) => {
                    let matches = expand_template(rule)?;
                    // In the absence of matching cgroups, have template layers
                    // behave as non-template layers do.
                    if matches.is_empty() {
                        layer_config.specs.push(spec);
                    } else {
                        for (mt, mask) in matches {
                            let mut genspec = spec.clone();

                            genspec.cpuset = Some(mask);

                            // Push the new "and" rule into each "or" term.
                            for orterm in &mut genspec.matches {
                                orterm.push(mt.clone());
                            }

                            match &mt {
                                LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
                                _ => bail!("Template match has unexpected type"),
                            }

                            // Push the generated layer into the config
                            layer_config.specs.push(genspec);
                        }
                    }
                }

                None => {
                    layer_config.specs.push(spec);
                }
            }
        }
    }

    // Fill in per-layer defaults and clamp user-provided values.
    for spec in layer_config.specs.iter_mut() {
        let common = spec.kind.common_mut();

        if common.slice_us == 0 {
            common.slice_us = opts.slice_us;
        }

        if common.weight == 0 {
            common.weight = DEFAULT_LAYER_WEIGHT;
        }
        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);

        if common.preempt {
            if common.disallow_open_after_us.is_some() {
                warn!(
                    "Preempt layer {} has non-null disallow_open_after_us, ignored",
                    &spec.name
                );
            }
            if common.disallow_preempt_after_us.is_some() {
                warn!(
                    "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
                    &spec.name
                );
            }
            common.disallow_open_after_us = Some(u64::MAX);
            common.disallow_preempt_after_us = Some(u64::MAX);
        } else {
            if common.disallow_open_after_us.is_none() {
                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
            }

            if common.disallow_preempt_after_us.is_none() {
                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
            }
        }

        if common.idle_smt.is_some() {
            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
        }
    }

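    // Memory-bandwidth tracking is only needed if at least one Confined or
    // Grouped layer specifies membw_gb; pass the result down to Scheduler::init.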
    let membw_required = layer_config.specs.iter().any(|spec| match spec.kind {
        LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
            membw_gb.is_some()
        }
        LayerKind::Open { .. } => false,
    });

    if opts.print_and_exit {
        println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
        return Ok(());
    }

    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
    let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;

    // Keep reinitializing and rerunning the scheduler until a run no longer
    // requests a restart.
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(
            &opts,
            &layer_config.specs,
            &mut open_object,
            &hint_to_layer_map,
            membw_required,
        )?;
        if !sched.run(shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}