scx_layered/main.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::ffi::CString;
12use std::fs;
13use std::io::Write;
14use std::mem::MaybeUninit;
15use std::ops::Sub;
16use std::path::Path;
17use std::path::PathBuf;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::thread::ThreadId;
22use std::time::Duration;
23use std::time::Instant;
24
25use anyhow::anyhow;
26use anyhow::bail;
27use anyhow::Context;
28use anyhow::Result;
29pub use bpf_skel::*;
30use clap::Parser;
31use crossbeam::channel::RecvTimeoutError;
32use lazy_static::lazy_static;
33use libbpf_rs::libbpf_sys;
34use libbpf_rs::AsRawLibbpf;
35use libbpf_rs::MapCore as _;
36use libbpf_rs::OpenObject;
37use libbpf_rs::ProgramInput;
38use log::info;
39use log::trace;
40use log::warn;
41use log::{debug, error};
42use nix::sched::CpuSet;
43use nvml_wrapper::error::NvmlError;
44use nvml_wrapper::Nvml;
45use once_cell::sync::OnceCell;
46use regex::Regex;
47use scx_bpf_compat;
48use scx_layered::*;
49use scx_raw_pmu::PMUManager;
50use scx_stats::prelude::*;
51use scx_utils::build_id;
52use scx_utils::compat;
53use scx_utils::init_libbpf_logging;
54use scx_utils::libbpf_clap_opts::LibbpfOpts;
55use scx_utils::perf;
56use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
57use scx_utils::read_netdevs;
58use scx_utils::scx_enums;
59use scx_utils::scx_ops_attach;
60use scx_utils::scx_ops_load;
61use scx_utils::scx_ops_open;
62use scx_utils::uei_exited;
63use scx_utils::uei_report;
64use scx_utils::CoreType;
65use scx_utils::Cpumask;
66use scx_utils::Llc;
67use scx_utils::NetDev;
68use scx_utils::Topology;
69use scx_utils::UserExitInfo;
70use scx_utils::NR_CPUS_POSSIBLE;
71use scx_utils::NR_CPU_IDS;
72use stats::LayerStats;
73use stats::StatsReq;
74use stats::StatsRes;
75use stats::SysStats;
76use std::collections::VecDeque;
77use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
78
79const SCHEDULER_NAME: &str = "scx_layered";
80const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
81const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
82const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
83const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
84const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
85const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
86const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
87const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
88const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
89const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
90
91const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
92const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
93const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
94const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
95const LAYER_USAGE_PROTECTED_PREEMPT: usize =
96    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
97const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
98
99const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
100const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
101const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
102
103const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
104
105static NVML: OnceCell<Nvml> = OnceCell::new();
106
107fn nvml() -> Result<&'static Nvml, NvmlError> {
108    NVML.get_or_try_init(Nvml::init)
109}
110
111lazy_static! {
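    // Per-second decay factor: raising this to the power of the elapsed time
    // (in seconds) halves a running average every USAGE_HALF_LIFE_F64 seconds.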
112    static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
113    static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
114    static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
115    static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
116        specs: vec![
117            LayerSpec {
118                name: "batch".into(),
119                comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
120                cpuset: None,
121                template: None,
122                matches: vec![
123                    vec![LayerMatch::CgroupPrefix("system.slice/".into())],
124                    vec![LayerMatch::NiceAbove(0)],
125                ],
126                kind: LayerKind::Confined {
127                    util_range: (0.8, 0.9),
128                    cpus_range: Some((0, 16)),
129                    cpus_range_frac: None,
130                    protected: false,
131                    membw_gb: None,
132                    common: LayerCommon {
133                        min_exec_us: 1000,
134                        yield_ignore: 0.0,
135                        preempt: false,
136                        preempt_first: false,
137                        exclusive: false,
138                        allow_node_aligned: false,
139                        skip_remote_node: false,
140                        prev_over_idle_core: false,
141                        idle_smt: None,
142                        slice_us: 20000,
143                        fifo: false,
144                        weight: DEFAULT_LAYER_WEIGHT,
145                        disallow_open_after_us: None,
146                        disallow_preempt_after_us: None,
147                        xllc_mig_min_us: 1000.0,
148                        growth_algo: LayerGrowthAlgo::Sticky,
149                        idle_resume_us: None,
150                        perf: 1024,
151                        nodes: vec![],
152                        llcs: vec![],
153                        placement: LayerPlacement::Standard,
154                    },
155                },
156            },
157            LayerSpec {
158                name: "immediate".into(),
159                comment: Some("tasks under workload.slice with nice value < 0".into()),
160                cpuset: None,
161                template: None,
162                matches: vec![vec![
163                    LayerMatch::CgroupPrefix("workload.slice/".into()),
164                    LayerMatch::NiceBelow(0),
165                ]],
166                kind: LayerKind::Open {
167                    common: LayerCommon {
168                        min_exec_us: 100,
169                        yield_ignore: 0.25,
170                        preempt: true,
171                        preempt_first: false,
172                        exclusive: true,
173                        allow_node_aligned: true,
174                        skip_remote_node: false,
175                        prev_over_idle_core: true,
176                        idle_smt: None,
177                        slice_us: 20000,
178                        fifo: false,
179                        weight: DEFAULT_LAYER_WEIGHT,
180                        disallow_open_after_us: None,
181                        disallow_preempt_after_us: None,
182                        xllc_mig_min_us: 0.0,
183                        growth_algo: LayerGrowthAlgo::Sticky,
184                        perf: 1024,
185                        idle_resume_us: None,
186                        nodes: vec![],
187                        llcs: vec![],
188                        placement: LayerPlacement::Standard,
189                    },
190                },
191            },
192            LayerSpec {
193                name: "stress-ng".into(),
194                comment: Some("stress-ng test layer".into()),
195                cpuset: None,
196                template: None,
197                matches: vec![
198                    vec![LayerMatch::CommPrefix("stress-ng".into()),],
199                    vec![LayerMatch::PcommPrefix("stress-ng".into()),]
200                ],
201                kind: LayerKind::Confined {
202                    cpus_range: None,
203                    util_range: (0.2, 0.8),
204                    protected: false,
205                    cpus_range_frac: None,
206                    membw_gb: None,
207                    common: LayerCommon {
208                        min_exec_us: 800,
209                        yield_ignore: 0.0,
210                        preempt: true,
211                        preempt_first: false,
212                        exclusive: false,
213                        allow_node_aligned: false,
214                        skip_remote_node: false,
215                        prev_over_idle_core: false,
216                        idle_smt: None,
217                        slice_us: 800,
218                        fifo: false,
219                        weight: DEFAULT_LAYER_WEIGHT,
220                        disallow_open_after_us: None,
221                        disallow_preempt_after_us: None,
222                        xllc_mig_min_us: 0.0,
223                        growth_algo: LayerGrowthAlgo::Topo,
224                        perf: 1024,
225                        idle_resume_us: None,
226                        nodes: vec![],
227                        llcs: vec![],
228                        placement: LayerPlacement::Standard,
229                    },
230                },
231            },
232            LayerSpec {
233                name: "normal".into(),
234                comment: Some("the rest".into()),
235                cpuset: None,
236                template: None,
237                matches: vec![vec![]],
238                kind: LayerKind::Grouped {
239                    cpus_range: None,
240                    util_range: (0.5, 0.6),
241                    util_includes_open_cputime: true,
242                    protected: false,
243                    cpus_range_frac: None,
244                    membw_gb: None,
245                    common: LayerCommon {
246                        min_exec_us: 200,
247                        yield_ignore: 0.0,
248                        preempt: false,
249                        preempt_first: false,
250                        exclusive: false,
251                        allow_node_aligned: false,
252                        skip_remote_node: false,
253                        prev_over_idle_core: false,
254                        idle_smt: None,
255                        slice_us: 20000,
256                        fifo: false,
257                        weight: DEFAULT_LAYER_WEIGHT,
258                        disallow_open_after_us: None,
259                        disallow_preempt_after_us: None,
260                        xllc_mig_min_us: 100.0,
261                        growth_algo: LayerGrowthAlgo::Linear,
262                        perf: 1024,
263                        idle_resume_us: None,
264                        nodes: vec![],
265                        llcs: vec![],
266                        placement: LayerPlacement::Standard,
267                    },
268                },
269            },
270        ],
271    };
272}
273
274/// scx_layered: A highly configurable multi-layer sched_ext scheduler
275///
276/// scx_layered allows classifying tasks into multiple layers and applying
277/// different scheduling policies to them. The configuration is specified in
278/// json and composed of two parts - matches and policies.
279///
280/// Matches
281/// =======
282///
283/// Whenever a task is forked or its attributes are changed, the task goes
284/// through a series of matches to determine the layer it belongs to. A
285/// match set is composed of OR groups of AND blocks. An example:
286///
287///   "matches": [
288///     [
289///       {
290///         "CgroupPrefix": "system.slice/"
291///       }
292///     ],
293///     [
294///       {
295///         "CommPrefix": "fbagent"
296///       },
297///       {
298///         "NiceAbove": 0
299///       }
300///     ]
301///   ],
302///
303/// The outer array contains the OR groups and the inner arrays the AND blocks, so the
304/// above matches:
305///
306/// - Tasks which are in the cgroup sub-hierarchy under "system.slice".
307///
308/// - Or tasks whose comm starts with "fbagent" and have a nice value > 0.
309///
310/// Currently, the following matches are supported:
311///
312/// - CgroupPrefix: Matches the prefix of the cgroup that the task belongs
313///   to. As this is a string match, whether the pattern has the trailing
314///   '/' makes a difference. For example, "TOP/CHILD/" only matches tasks
315///   which are under that particular cgroup while "TOP/CHILD" also matches
316///   tasks under "TOP/CHILD0/" or "TOP/CHILD1/".
317///
318/// - CommPrefix: Matches the task's comm prefix.
319///
320/// - PcommPrefix: Matches the task's thread group leader's comm prefix.
321///
322/// - NiceAbove: Matches if the task's nice value is greater than the
323///   pattern.
324///
325/// - NiceBelow: Matches if the task's nice value is smaller than the
326///   pattern.
327///
328/// - NiceEquals: Matches if the task's nice value is exactly equal to
329///   the pattern.
330///
331/// - UIDEquals: Matches if the task's effective user id matches the value
332///
333/// - GIDEquals: Matches if the task's effective group id matches the value.
334///
335/// - PIDEquals: Matches if the task's pid matches the value.
336///
337/// - PPIDEquals: Matches if the task's ppid matches the value.
338///
339/// - TGIDEquals: Matches if the task's tgid matches the value.
340///
341/// - NSPIDEquals: Matches if the task's namespace id and pid match the values.
342///
343/// - NSEquals: Matches if the task's namespace id matches the value.
344///
345/// - IsGroupLeader: Bool. When true, matches if the task is group leader
346///   (i.e. PID == TGID), aka the thread from which other threads are made.
347///   When false, matches if the task is *not* the group leader (i.e. the rest).
348///
349/// - CmdJoin: Matches when the task uses pthread_setname_np to send a join/leave
350///   command to the scheduler. See examples/cmdjoin.c for more details.
351///
352/// - UsedGpuTid: Bool. When true, matches tasks which have used GPUs,
353///   identified by tid.
354///
355/// - UsedGpuPid: Bool. When true, matches tasks which have used GPUs,
356///   identified by tgid/pid.
357///
358/// - [EXPERIMENTAL] AvgRuntime: (u64, u64). Match tasks whose average runtime
359///   is within the provided values [min, max).
360///
361/// - HintEquals: u64. Match tasks whose hint value equals this value.
362///   The value must be in the range [0, 1024].
363///
364/// While there are complexity limitations as the matches are performed in
365/// BPF, it is straightforward to add more types of matches.
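///
/// For illustration (the comm prefix and nice value here are arbitrary), a
/// match set combining two of the kinds above would select tasks whose thread
/// group leader's comm starts with "nginx" and whose nice value is exactly 0:
///
///   "matches": [
///     [
///       {
///         "PcommPrefix": "nginx"
///       },
///       {
///         "NiceEquals": 0
///       }
///     ]
///   ],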
366///
367/// Templates
368/// ---------
369///
370/// Templates let us create a variable number of layers dynamically at initialization
371/// time out of a cgroup name suffix/prefix. Sometimes we know there are multiple
372/// applications running on a machine, each with their own cgroup but do not know the
373/// exact names of the applications or cgroups, e.g., in cloud computing contexts where
374/// workloads are placed on machines dynamically and run under cgroups whose name is
375/// autogenerated. In that case, we cannot hardcode the cgroup match rules when writing
376/// the configuration. We thus cannot easily prevent tasks from different cgroups from
377/// falling into the same layer and affecting each other's performance.
378///
379///
380/// Templates offer a solution to this problem by generating one layer for each such cgroup,
381/// provided these cgroups share a suffix, and that the suffix is unique to them. Templates
382/// have a cgroup suffix rule that we use to find the relevant cgroups in the system. For each
383/// such cgroup, we copy the layer config and add a matching rule that matches just this cgroup.
384///
385///
386/// Policies
387/// ========
388///
389/// The following is an example policy configuration for a layer.
390///
391///   "kind": {
392///     "Confined": {
393///       "cpus_range": [1, 8],
394///       "util_range": [0.8, 0.9]
395///     }
396///   }
397///
398/// It's of the "Confined" kind, which tries to concentrate the layer's tasks
399/// into a limited number of CPUs. In the above case, the number of CPUs
400/// assigned to the layer is scaled between 1 and 8 so that the per-cpu
401/// utilization is kept between 80% and 90%. If the CPUs are loaded higher
402/// than 90%, more CPUs are allocated to the layer. If the utilization drops
403/// below 80%, the layer loses CPUs.
404///
405/// Currently, the following policy kinds are supported:
406///
407/// - Confined: Tasks are restricted to the allocated CPUs. The number of
408///   CPUs allocated is modulated to keep the per-CPU utilization in
409///   "util_range". The range can optionally be restricted with the
410///   "cpus_range" property.
411///
412/// - Grouped: Similar to Confined but tasks may spill outside if there are
413///   idle CPUs outside the allocated ones. The range can optionally be
414///   restricted with the "cpus_range" property.
415///
416/// - Open: Prefer the CPUs which are not occupied by Confined or Grouped
417///   layers. Tasks in this group will spill into occupied CPUs if there are
418///   no unoccupied idle CPUs.
419///
420/// All layers take the following options:
421///
422/// - min_exec_us: Minimum execution time in microseconds. Whenever a task
423///   is scheduled in, this is the minimum CPU time that it's charged no
424///   matter how short the actual execution time may be.
425///
426/// - yield_ignore: Yield ignore ratio. If 0.0, yield(2) forfeits a whole
427///   execution slice. 0.25 yields three quarters of an execution slice and
428///   so on. If 1.0, yield is completely ignored.
429///
430/// - slice_us: Scheduling slice duration in microseconds.
431///
432/// - fifo: Use FIFO queues within the layer instead of the default vtime.
433///
434/// - preempt: If true, tasks in the layer will preempt tasks which belong
435///   to other non-preempting layers when no idle CPUs are available.
436///
437/// - preempt_first: If true, tasks in the layer will try to preempt tasks
438///   in their previous CPUs before trying to find idle CPUs.
439///
440/// - exclusive: If true, tasks in the layer will occupy the whole core. The
441///   other logical CPUs sharing the same core will be kept idle. This isn't
442///   a hard guarantee, so don't depend on it for security purposes.
443///
444/// - allow_node_aligned: Put node aligned tasks on layer DSQs instead of lo
445///   fallback. This is a hack to support node-affine tasks without making
446///   the whole scheduler node aware and should only be used with open
447///   layers on non-saturated machines to avoid possible stalls.
448///
449/// - prev_over_idle_core: On SMT enabled systems, prefer the task's previous CPU
450///   when picking a CPU for tasks on this layer, even if that CPU's SMT
451///   sibling is processing a task.
452///
453/// - weight: Weight of the layer, which is a range from 1 to 10000 with a
454///   default of 100. Layer weights are used during contention to prevent
455///   starvation across layers. Weights are used in combination with
456///   utilization to determine the infeasible adjusted weight with higher
457///   weights having a larger adjustment in adjusted utilization.
458///
459/// - disallow_open_after_us: Duration to wait after machine reaches saturation
460///   before confining tasks in Open layers.
461///
462/// - cpus_range_frac: Array of 2 floats between 0 and 1.0. Lower and upper
463///   bound fractions of all CPUs to give to a layer. Mutually exclusive
464///   with cpus_range.
465///
466/// - disallow_preempt_after_us: Duration to wait after machine reaches saturation
467///   before preventing tasks from preempting.
468///
469/// - xllc_mig_min_us: Skip a cross-LLC migration if the task is likely to run on
470///   its existing LLC sooner than this.
471///
472/// - idle_smt: ***DEPRECATED***
473///
474/// - growth_algo: When a layer is allocated new CPUs different algorithms can
475///   be used to determine which CPU should be allocated next. The default
476///   algorithm is a "sticky" algorithm that attempts to spread layers evenly
477///   across cores.
478///
479/// - perf: CPU performance target. 0 means no configuration. A value
480///   between 1 and 1024 sets the performance level that CPUs running tasks
481///   in this layer are configured to via scx_bpf_cpuperf_set().
482///
483/// - idle_resume_us: Sets the idle resume QoS value. CPU idle time governors are expected to
484///   regard the minimum of the global (effective) CPU latency limit and the effective resume
485///   latency constraint for the given CPU as the upper limit for the exit latency of the idle
486///   states. See the latest kernel docs for more details:
487///   https://www.kernel.org/doc/html/latest/admin-guide/pm/cpuidle.html
488///
489/// - nodes: If set, the layer will use the given set of NUMA nodes for scheduling
490///   decisions. If unset, all available NUMA nodes will be used. If the
491///   llcs value is also set, the cpuset of NUMA nodes will be or'ed with the LLC
492///   config.
493///
494/// - llcs: If set, the layer will use the given set of LLCs (last level caches)
495///   for scheduling decisions. If unset, all LLCs will be used. If
496///   the nodes value is also set, the cpuset of LLCs will be or'ed with the nodes
497///   config.
498///
499///
500/// Similar to matches, adding new policies and extending existing ones
501/// should be relatively straightforward.
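///
/// As a sketch (the values are illustrative; `scx_layered -e <file>` generates the
/// authoritative example configuration), a full layer entry using the "Grouped"
/// kind might look like:
///
///   {
///     "name": "workload",
///     "comment": "latency sensitive services",
///     "matches": [
///       [
///         {
///           "CgroupPrefix": "workload.slice/"
///         }
///       ]
///     ],
///     "kind": {
///       "Grouped": {
///         "cpus_range": [4, 32],
///         "util_range": [0.5, 0.7]
///       }
///     }
///   }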
502///
503/// Configuration example and running scx_layered
504/// =============================================
505///
506/// An scx_layered config is composed of layer configs. A layer config is
507/// composed of a name, a set of matches, and a policy block. Running the
508/// following will write an example configuration into example.json.
509///
510///   $ scx_layered -e example.json
511///
512/// Note that the last layer in the configuration must have an empty match set
513/// as a catch-all for tasks which haven't been matched into previous layers.
514///
515/// The configuration can be specified in multiple json files and
516/// command line arguments, which are concatenated in the specified
517/// order. Each must contain valid layer configurations.
518///
519/// By default, an argument to scx_layered is interpreted as a JSON string. If
520/// the argument is a pointer to a JSON file, it should be prefixed with file:
521/// or f: as follows:
522///
523///   $ scx_layered file:example.json
524///   ...
525///   $ scx_layered f:example.json
526///
527/// Monitoring Statistics
528/// =====================
529///
530/// Run with `--stats INTERVAL` to enable stats monitoring. There is
531/// also an scx_stat server listening on /var/run/scx/root/stat that can
532/// be monitored by running `scx_layered --monitor INTERVAL` separately.
533///
534///   ```bash
535///   $ scx_layered --monitor 1
536///   tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 proc=6ms
537///   busy= 34.2 util= 1733.6 load=  21744.1 fallback_cpu=  1
538///     batch    : util/frac=   11.8/  0.7 load/frac=     29.7:  0.1 tasks=  2597
539///                tot=   3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00
540///                cpus=  2 [  2,  2] 04000001 00000000
541///     immediate: util/frac= 1218.8/ 70.3 load/frac=  21399.9: 98.4 tasks=  1107
542///                tot=  68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00
543///                cpus= 50 [ 50, 50] fbfffffe 000fffff
544///     normal   : util/frac=  502.9/ 29.0 load/frac=    314.5:  1.4 tasks=  3512
545///                tot=  45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56
546///                cpus= 50 [ 50, 50] fbfffffe 000fffff
547///   ```
548///
549/// Global statistics: see [`SysStats`]
550///
551/// Per-layer statistics: see [`LayerStats`]
552///
553#[derive(Debug, Parser)]
554#[command(verbatim_doc_comment)]
555struct Opts {
556    /// Scheduling slice duration in microseconds.
557    #[clap(short = 's', long, default_value = "20000")]
558    slice_us: u64,
559
560    /// Maximum consecutive execution time in microseconds. A task may be
561    /// allowed to keep executing on a CPU for this long. Note that this is
562    /// the upper limit and a task may have to be moved off the CPU earlier. 0
563    /// indicates default - 20 * slice_us.
564    #[clap(short = 'M', long, default_value = "0")]
565    max_exec_us: u64,
566
567    /// Scheduling interval in seconds.
568    #[clap(short = 'i', long, default_value = "0.1")]
569    interval: f64,
570
571    /// ***DEPRECATED*** Disable load-fraction based max layer CPU limit.
572    /// recommended.
573    #[clap(short = 'n', long, default_value = "false")]
574    no_load_frac_limit: bool,
575
576    /// Exit debug dump buffer length. 0 indicates default.
577    #[clap(long, default_value = "0")]
578    exit_dump_len: u32,
579
580    /// Enable verbose output, including libbpf details. Specify multiple
581    /// times to increase verbosity.
582    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
583    verbose: u8,
584
585    /// Disable topology awareness. When enabled, the "nodes" and "llcs" settings on
586    /// a layer are ignored. Defaults to false on topologies with multiple NUMA nodes
587    /// or LLCs, and true otherwise.
588    #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
589    disable_topology: Option<bool>,
590
591    /// Enable cross NUMA preemption.
592    #[clap(long)]
593    xnuma_preemption: bool,
594
595    /// Disable monitor
596    #[clap(long)]
597    monitor_disable: bool,
598
599    /// Write example layer specifications into the file and exit.
600    #[clap(short = 'e', long)]
601    example: Option<String>,
602
603    /// ***DEPRECATED*** Disables preemption if the weighted load fraction
604    /// of a layer (load_frac_adj) exceeds the threshold. The default is
605    /// disabled (0.0).
606    #[clap(long, default_value = "0.0")]
607    layer_preempt_weight_disable: f64,
608
609    /// ***DEPRECATED*** Disables layer growth if the weighted load fraction
610    /// of a layer (load_frac_adj) exceeds the threshold. The default is
611    /// disabled (0.0).
612    #[clap(long, default_value = "0.0")]
613    layer_growth_weight_disable: f64,
614
615    /// Enable stats monitoring with the specified interval.
616    #[clap(long)]
617    stats: Option<f64>,
618
619    /// Run in stats monitoring mode with the specified interval. Scheduler
620    /// is not launched.
621    #[clap(long)]
622    monitor: Option<f64>,
623
624    /// Run with example layer specifications (useful for e.g. CI pipelines)
625    #[clap(long)]
626    run_example: bool,
627
628    /// ***DEPRECATED*** Enables iteration over local LLCs first for
629    /// dispatch.
630    #[clap(long, default_value = "false")]
631    local_llc_iteration: bool,
632
633    /// Low priority fallback DSQs are used to execute tasks with custom CPU
634    /// affinities. These DSQs are immediately executed iff a CPU is
635    /// otherwise idle. However, after the specified wait, they are
636    /// guaranteed up to --lo-fb-share fraction of each CPU.
637    #[clap(long, default_value = "10000")]
638    lo_fb_wait_us: u64,
639
640    /// The fraction of CPU time guaranteed to low priority fallback DSQs.
641    /// See --lo-fb-wait-us.
642    #[clap(long, default_value = ".05")]
643    lo_fb_share: f64,
644
645    /// Disable antistall
646    #[clap(long, default_value = "false")]
647    disable_antistall: bool,
648
649    /// Enable numa topology based gpu task affinitization.
650    #[clap(long, default_value = "false")]
651    enable_gpu_affinitize: bool,
652
653    /// Interval at which to reaffinitize GPU tasks to NUMA nodes.
654    /// Defaults to 900s.
655    #[clap(long, default_value = "900")]
656    gpu_affinitize_secs: u64,
657
658    /// Enable match debug.
659    /// This stores a mapping of task tid
660    /// to layer id so that bpftool map dump
661    /// can be used to debug layer matches.
662    #[clap(long, default_value = "false")]
663    enable_match_debug: bool,
664
665    /// Maximum task runnable_at delay (in seconds) before antistall turns on
666    #[clap(long, default_value = "3")]
667    antistall_sec: u64,
668
669    /// Enable gpu support
670    #[clap(long, default_value = "false")]
671    enable_gpu_support: bool,
672
673    /// GPU kprobe level.
674    /// The value set here determines how aggressive
675    /// the kprobes enabled on GPU driver functions are.
676    /// Higher values are more aggressive, incurring more system overhead
677    /// and identifying PIDs using GPUs more accurately and in a more timely manner.
678    /// Lower values incur less system overhead, at the cost of less accurately
679    /// identifying GPU PIDs and taking longer to do so.
680    #[clap(long, default_value = "3")]
681    gpu_kprobe_level: u64,
682
683    /// Enable netdev IRQ balancing. This is experimental and should be used with caution.
684    #[clap(long, default_value = "false")]
685    netdev_irq_balance: bool,
686
687    /// Disable queued wakeup optimization.
688    #[clap(long, default_value = "false")]
689    disable_queued_wakeup: bool,
690
691    /// Per-cpu kthreads are preempting by default. Make it not so.
692    #[clap(long, default_value = "false")]
693    disable_percpu_kthread_preempt: bool,
694
695    /// Only highpri (nice < 0) per-cpu kthreads are preempting by default.
696    /// Make every per-cpu kthread preempt. Meaningful only if
697    /// --disable-percpu-kthread-preempt is not set.
698    #[clap(long, default_value = "false")]
699    percpu_kthread_preempt_all: bool,
700
701    /// Print scheduler version and exit.
702    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
703    version: bool,
704
705    /// Show descriptions for statistics.
706    #[clap(long)]
707    help_stats: bool,
708
709    /// Layer specification. See --help.
710    specs: Vec<String>,
711
712    /// Periodically force tasks in layers using the AvgRuntime match rule to reevaluate which layer they belong to. Default period of 2s. Setting this to 0
713    /// turns this off.
714    #[clap(long, default_value = "2000")]
715    layer_refresh_ms_avgruntime: u64,
716
717    /// Set the path for pinning the task hint map.
718    #[clap(long, default_value = "")]
719    task_hint_map: String,
720
721    /// Print the config (after template expansion) and exit.
722    #[clap(long, default_value = "false")]
723    print_and_exit: bool,
724
725    /// Enable affinitized tasks to use the hi fallback queue to get more CPU time.
726    #[clap(long, default_value = "")]
727    hi_fb_thread_name: String,
728
729    #[clap(flatten, next_help_heading = "Libbpf Options")]
730    pub libbpf: LibbpfOpts,
731}
732
733fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
734    reader
735        .read_stat()
736        .context("Failed to read procfs")?
737        .total_cpu
738        .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
739}
740
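/// Computes the busy fraction of total CPU time between two /proc/stat
/// snapshots, clamped to [0.0, 1.0]. Returns 1.0 if the total delta is zero.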
741fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
742    match (curr, prev) {
743        (
744            fb_procfs::CpuStat {
745                user_usec: Some(curr_user),
746                nice_usec: Some(curr_nice),
747                system_usec: Some(curr_system),
748                idle_usec: Some(curr_idle),
749                iowait_usec: Some(curr_iowait),
750                irq_usec: Some(curr_irq),
751                softirq_usec: Some(curr_softirq),
752                stolen_usec: Some(curr_stolen),
753                ..
754            },
755            fb_procfs::CpuStat {
756                user_usec: Some(prev_user),
757                nice_usec: Some(prev_nice),
758                system_usec: Some(prev_system),
759                idle_usec: Some(prev_idle),
760                iowait_usec: Some(prev_iowait),
761                irq_usec: Some(prev_irq),
762                softirq_usec: Some(prev_softirq),
763                stolen_usec: Some(prev_stolen),
764                ..
765            },
766        ) => {
767            let idle_usec = curr_idle.saturating_sub(*prev_idle);
768            let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
769            let user_usec = curr_user.saturating_sub(*prev_user);
770            let system_usec = curr_system.saturating_sub(*prev_system);
771            let nice_usec = curr_nice.saturating_sub(*prev_nice);
772            let irq_usec = curr_irq.saturating_sub(*prev_irq);
773            let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
774            let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
775
776            let busy_usec =
777                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
778            let total_usec = idle_usec + busy_usec + iowait_usec;
779            if total_usec > 0 {
780                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
781            } else {
782                Ok(1.0)
783            }
784        }
785        _ => bail!("Missing stats in cpustat"),
786    }
787}
788
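/// Copies `src` into the fixed-size i8 buffer `dst` as a NUL-terminated C
/// string. Panics if `src` contains interior NULs or does not fit in `dst`.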
789fn copy_into_cstr(dst: &mut [i8], src: &str) {
790    let cstr = CString::new(src).unwrap();
791    let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
792    dst[0..bytes.len()].copy_from_slice(bytes);
793}
794
795fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
796    let mut mask = 0;
797    for node in nodes {
798        mask |= 1 << node;
799    }
800    mask
801}
802
803fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
804    let mut mask = 0;
805    for (_, cache) in llcs {
806        mask |= 1 << cache.id;
807    }
808    mask
809}
810
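/// Reads the per-CPU `cpu_ctx` structs from the `cpu_ctxs` percpu BPF map,
/// returning one entry per possible CPU.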
811fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
812    let mut cpu_ctxs = vec![];
813    let cpu_ctxs_vec = skel
814        .maps
815        .cpu_ctxs
816        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
817        .context("Failed to lookup cpu_ctx")?
818        .unwrap();
819    for cpu in 0..*NR_CPUS_POSSIBLE {
820        cpu_ctxs.push(*unsafe {
821            &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
822        });
823    }
824    Ok(cpu_ctxs)
825}
826
827#[derive(Clone, Debug)]
828struct BpfStats {
829    gstats: Vec<u64>,
830    lstats: Vec<Vec<u64>>,
831    lstats_sums: Vec<u64>,
832    llc_lstats: Vec<Vec<Vec<u64>>>, // [layer][llc][stat]
833}
834
835impl BpfStats {
836    fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
837        let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
838        let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
839        let mut gstats = vec![0u64; NR_GSTATS];
840        let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
841        let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
842
843        for cpu in 0..*NR_CPUS_POSSIBLE {
844            for stat in 0..NR_GSTATS {
845                gstats[stat] += cpu_ctxs[cpu].gstats[stat];
846            }
847            for layer in 0..nr_layers {
848                for stat in 0..NR_LSTATS {
849                    lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
850                }
851            }
852        }
853
854        let mut lstats_sums = vec![0u64; NR_LSTATS];
855        for layer in 0..nr_layers {
856            for stat in 0..NR_LSTATS {
857                lstats_sums[stat] += lstats[layer][stat];
858            }
859        }
860
861        for llc_id in 0..nr_llcs {
862            // XXX - This would be a lot easier if llc_ctx were in
863            // the bss. Unfortunately, kernel < v6.12 crashes and
864            // kernel >= v6.12 fails verification after such
865            // conversion due to seemingly verifier bugs. Convert to
866            // bss maps later.
867            let key = llc_id as u32;
868            let llc_id_slice =
869                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
870            let v = skel
871                .maps
872                .llc_data
873                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
874                .unwrap()
875                .unwrap();
876            let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
877
878            for layer_id in 0..nr_layers {
879                for stat_id in 0..NR_LLC_LSTATS {
880                    llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
881                }
882            }
883        }
884
885        Self {
886            gstats,
887            lstats,
888            lstats_sums,
889            llc_lstats,
890        }
891    }
892}
893
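// Subtracting one snapshot from another yields the per-interval deltas used
// for stats reporting. LLC latency is not subtractable and is taken from the
// left-hand side as-is.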
894impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
895    type Output = BpfStats;
896
897    fn sub(self, rhs: &'b BpfStats) -> BpfStats {
898        let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
899        BpfStats {
900            gstats: vec_sub(&self.gstats, &rhs.gstats),
901            lstats: self
902                .lstats
903                .iter()
904                .zip(rhs.lstats.iter())
905                .map(|(l, r)| vec_sub(l, r))
906                .collect(),
907            lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
908            llc_lstats: self
909                .llc_lstats
910                .iter()
911                .zip(rhs.llc_lstats.iter())
912                .map(|(l_layer, r_layer)| {
913                    l_layer
914                        .iter()
915                        .zip(r_layer.iter())
916                        .map(|(l_llc, r_llc)| {
917                            let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
918                            // Lat is not subtractable, take L side.
919                            r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
920                            vec_sub(&l_llc, &r_llc)
921                        })
922                        .collect()
923                })
924                .collect(),
925        }
926    }
927}
928
929#[derive(Clone, Debug)]
930struct Stats {
931    at: Instant,
932    elapsed: Duration,
933    nr_layers: usize,
934    nr_layer_tasks: Vec<usize>,
935    nr_nodes: usize,
936
937    total_util: f64, // Running AVG of sum of layer_utils
938    layer_utils: Vec<Vec<f64>>,
939    prev_layer_usages: Vec<Vec<u64>>,
940
941    layer_membws: Vec<Vec<f64>>, // Estimated memory bandwidth consumption
942    prev_layer_membw_agg: Vec<Vec<u64>>, // Estimated aggregate membw consumption
943
944    cpu_busy: f64, // Read from /proc, maybe higher than total_util
945    prev_total_cpu: fb_procfs::CpuStat,
946
947    bpf_stats: BpfStats,
948    prev_bpf_stats: BpfStats,
949
950    processing_dur: Duration,
951    prev_processing_dur: Duration,
952
953    layer_slice_us: Vec<u64>,
954
955    gpu_tasks_affinitized: u64,
956    gpu_task_affinitization_ms: u64,
957}
958
959impl Stats {
960    fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
961        let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
962
963        for cpu in 0..*NR_CPUS_POSSIBLE {
964            for layer in 0..nr_layers {
965                for usage in 0..NR_LAYER_USAGES {
966                    layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
967                }
968            }
969        }
970
971        layer_usages
972    }
973
974    // Same as above, but for aggregate memory bandwidth consumption.
975    fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
976        let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
977
978        for cpu in 0..*NR_CPUS_POSSIBLE {
979            for layer in 0..nr_layers {
980                for usage in 0..NR_LAYER_USAGES {
981                    let delta = cpu_ctxs[cpu].layer_membw_agg[layer][usage];
982                    layer_membw_agg[layer][usage] += delta;
983                }
984            }
985        }
986
987        layer_membw_agg
988    }
989
990    fn new(
991        skel: &mut BpfSkel,
992        proc_reader: &fb_procfs::ProcReader,
993        gpu_task_affinitizer: &GpuTaskAffinitizer,
994    ) -> Result<Self> {
995        let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
996        let cpu_ctxs = read_cpu_ctxs(skel)?;
997        let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
998        let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
999
1000        Ok(Self {
1001            at: Instant::now(),
1002            elapsed: Default::default(),
1003            nr_layers,
1004            nr_layer_tasks: vec![0; nr_layers],
1005            nr_nodes,
1006
1007            total_util: 0.0,
1008            layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1009            layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1010            prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1011            prev_layer_membw_agg: Self::read_layer_membw_agg(&cpu_ctxs, nr_layers),
1012
1013            cpu_busy: 0.0,
1014            prev_total_cpu: read_total_cpu(proc_reader)?,
1015
1016            bpf_stats: bpf_stats.clone(),
1017            prev_bpf_stats: bpf_stats,
1018
1019            processing_dur: Default::default(),
1020            prev_processing_dur: Default::default(),
1021
1022            layer_slice_us: vec![0; nr_layers],
1023            gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1024            gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1025        })
1026    }
1027
1028    fn refresh(
1029        &mut self,
1030        skel: &mut BpfSkel,
1031        proc_reader: &fb_procfs::ProcReader,
1032        now: Instant,
1033        cur_processing_dur: Duration,
1034        gpu_task_affinitizer: &GpuTaskAffinitizer,
1035    ) -> Result<()> {
1036        let elapsed = now.duration_since(self.at);
1037        let elapsed_f64 = elapsed.as_secs_f64();
1038        let cpu_ctxs = read_cpu_ctxs(skel)?;
1039
1040        let nr_layer_tasks: Vec<usize> = skel
1041            .maps
1042            .bss_data
1043            .as_ref()
1044            .unwrap()
1045            .layers
1046            .iter()
1047            .take(self.nr_layers)
1048            .map(|layer| layer.nr_tasks as usize)
1049            .collect();
1050        let layer_slice_us: Vec<u64> = skel
1051            .maps
1052            .bss_data
1053            .as_ref()
1054            .unwrap()
1055            .layers
1056            .iter()
1057            .take(self.nr_layers)
1058            .map(|layer| layer.slice_ns / 1000_u64)
1059            .collect();
1060
1061        let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1062        let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1063
1064        // Computes the runtime deltas, converts them from ns to s, and divides by the elapsed time to get per-layer utilization.
1065        let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1066            cur_agg
1067                .iter()
1068                .zip(prev_agg.iter())
1069                .map(|(cur, prev)| {
1070                    cur.iter()
1071                        .zip(prev.iter())
1072                        .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1073                        .collect()
1074                })
1075                .collect()
1076        };
1077
1078        // Computes the total memory traffic done since last computation and normalizes to bytes/s
1079        // over the elapsed timescale.
1080        let compute_mem_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1081            cur_agg
1082                .iter()
1083                .zip(prev_agg.iter())
1084                .map(|(cur, prev)| {
1085                    cur.iter()
1086                        .zip(prev.iter())
1087                        .map(|(c, p)| (c - p) as f64 / elapsed_f64)
1088                        .collect()
1089                })
1090                .collect()
1091        };
1092
1093        let cur_layer_utils: Vec<Vec<f64>> =
1094            compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1095        let cur_layer_membw: Vec<Vec<f64>> =
1096            compute_mem_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1097
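        // Blend the fresh per-interval measurements into the running averages
        // using exponential decay; `decay_rate` is the per-second retention
        // factor, so longer refresh intervals discount the previous value more.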
1098        let metric_decay =
1099            |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>, decay_rate: f64| {
1100                cur_metric
1101                    .iter()
1102                    .zip(prev_metric.iter())
1103                    .map(|(cur, prev)| {
1104                        cur.iter()
1105                            .zip(prev.iter())
1106                            .map(|(c, p)| {
1107                                let decay = decay_rate.powf(elapsed_f64);
1108                                p * decay + c * (1.0 - decay)
1109                            })
1110                            .collect()
1111                    })
1112                    .collect()
1113            };
1114
1115        let layer_utils: Vec<Vec<f64>> =
1116            metric_decay(cur_layer_utils, &self.layer_utils, *USAGE_DECAY);
1117        let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws, 0.0);
1118
1119        let cur_total_cpu = read_total_cpu(proc_reader)?;
1120        let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1121
1122        let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1123        let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1124
1125        let processing_dur = cur_processing_dur
1126            .checked_sub(self.prev_processing_dur)
1127            .unwrap();
1128
1129        *self = Self {
1130            at: now,
1131            elapsed,
1132            nr_layers: self.nr_layers,
1133            nr_layer_tasks,
1134            nr_nodes: self.nr_nodes,
1135
1136            total_util: layer_utils
1137                .iter()
1138                .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1139                .sum(),
1140            layer_utils,
1141            layer_membws,
1142            prev_layer_usages: cur_layer_usages,
1143            prev_layer_membw_agg: cur_layer_membw_agg,
1144
1145            cpu_busy,
1146            prev_total_cpu: cur_total_cpu,
1147
1148            bpf_stats,
1149            prev_bpf_stats: cur_bpf_stats,
1150
1151            processing_dur,
1152            prev_processing_dur: cur_processing_dur,
1153
1154            layer_slice_us,
1155            gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1156            gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1157        };
1158        Ok(())
1159    }
1160}
1161
1162#[derive(Debug)]
1163struct Layer {
1164    name: String,
1165    kind: LayerKind,
1166    growth_algo: LayerGrowthAlgo,
1167    core_order: Vec<usize>,
1168
1169    target_llc_cpus: (usize, usize),
1170    assigned_llcs: Vec<usize>,
1171
1172    nr_cpus: usize,
1173    nr_llc_cpus: Vec<usize>,
1174    cpus: Cpumask,
1175    allowed_cpus: Cpumask,
1176}
1177
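/// Looks up a kernel symbol's address by scanning /proc/kallsyms. Note this is
/// a substring match, so the first line containing `sym_name` wins.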
1178fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1179    fs::read_to_string("/proc/kallsyms")?
1180        .lines()
1181        .find(|line| line.contains(sym_name))
1182        .and_then(|line| line.split_whitespace().next())
1183        .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1184        .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1185}
1186
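/// Resolves a layer's (min, max) CPU count from either `cpus_range` or
/// `cpus_range_frac` (the two are mutually exclusive), defaulting to
/// (0, max_cpus) when neither is given.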
1187fn resolve_cpus_pct_range(
1188    cpus_range: &Option<(usize, usize)>,
1189    cpus_range_frac: &Option<(f64, f64)>,
1190    max_cpus: usize,
1191) -> Result<(usize, usize)> {
1192    match (cpus_range, cpus_range_frac) {
1193        (Some(_x), Some(_y)) => {
1194            bail!("cpus_range cannot be used with cpus_range_frac.");
1195        }
1196        (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1197        (None, Some((cpus_frac_min, cpus_frac_max))) => {
1198            if *cpus_frac_min < 0_f64
1199                || *cpus_frac_min > 1_f64
1200                || *cpus_frac_max < 0_f64
1201                || *cpus_frac_max > 1_f64
1202            {
1203                bail!("cpus_range_frac values must be between 0.0 and 1.0");
1204            }
1205            let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1206            let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1207            Ok((
1208                std::cmp::max(cpus_min_count, 1),
1209                std::cmp::min(cpus_max_count, max_cpus),
1210            ))
1211        }
1212        (None, None) => Ok((0, max_cpus)),
1213    }
1214}
1215
1216impl Layer {
1217    fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1218        let name = &spec.name;
1219        let kind = spec.kind.clone();
1220        let mut allowed_cpus = Cpumask::new();
1221        match &kind {
1222            LayerKind::Confined {
1223                cpus_range,
1224                cpus_range_frac,
1225                common: LayerCommon { nodes, llcs, .. },
1226                ..
1227            } => {
1228                let cpus_range =
1229                    resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1230                if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1231                    bail!("invalid cpus_range {:?}", cpus_range);
1232                }
1233                if nodes.is_empty() && llcs.is_empty() {
1234                    allowed_cpus.set_all();
1235                } else {
1236                    // build up the cpus bitset
1237                    for (node_id, node) in &topo.nodes {
1238                        // first do the matching for nodes
1239                        if nodes.contains(node_id) {
1240                            for &id in node.all_cpus.keys() {
1241                                allowed_cpus.set_cpu(id)?;
1242                            }
1243                        }
1244                        // next match on any LLCs
1245                        for (llc_id, llc) in &node.llcs {
1246                            if llcs.contains(llc_id) {
1247                                for &id in llc.all_cpus.keys() {
1248                                    allowed_cpus.set_cpu(id)?;
1249                                }
1250                            }
1251                        }
1252                    }
1253                }
1254            }
1255            LayerKind::Grouped {
1256                common: LayerCommon { nodes, llcs, .. },
1257                ..
1258            }
1259            | LayerKind::Open {
1260                common: LayerCommon { nodes, llcs, .. },
1261                ..
1262            } => {
1263                if nodes.is_empty() && llcs.is_empty() {
1264                    allowed_cpus.set_all();
1265                } else {
1266                    // build up the cpus bitset
1267                    for (node_id, node) in &topo.nodes {
1268                        // first do the matching for nodes
1269                        if nodes.contains(node_id) {
1270                            for &id in node.all_cpus.keys() {
1271                                allowed_cpus.set_cpu(id)?;
1272                            }
1273                        }
1274                        // next match on any LLCs
1275                        for (llc_id, llc) in &node.llcs {
1276                            if llcs.contains(llc_id) {
1277                                for &id in llc.all_cpus.keys() {
1278                                    allowed_cpus.set_cpu(id)?;
1279                                }
1280                            }
1281                        }
1282                    }
1283                }
1284            }
1285        }
1286
1287        // Util can be above 1.0 for grouped layers if
1288        // util_includes_open_cputime is set.
1289        if let Some(util_range) = kind.util_range() {
1290            if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1291                bail!("invalid util_range {:?}", util_range);
1292            }
1293        }
1294
1295        let layer_growth_algo = kind.common().growth_algo.clone();
1296
1297        debug!(
1298            "layer: {} algo: {:?} core order: {:?}",
1299            name, &layer_growth_algo, core_order
1300        );
1301
1302        Ok(Self {
1303            name: name.into(),
1304            kind,
1305            growth_algo: layer_growth_algo,
1306            core_order: core_order.clone(),
1307
1308            target_llc_cpus: (0, 0),
1309            assigned_llcs: vec![],
1310
1311            nr_cpus: 0,
1312            nr_llc_cpus: vec![0; topo.all_llcs.len()],
1313            cpus: Cpumask::new(),
1314            allowed_cpus,
1315        })
1316    }
1317
1318    fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1319        let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1320            Some(ret) => ret.clone(),
1321            None => return Ok(0),
1322        };
1323
1324        let nr_to_free = cpus_to_free.weight();
1325
1326        Ok(if nr_to_free <= max_to_free {
1327            trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1328            self.cpus &= &cpus_to_free.not();
1329            self.nr_cpus -= nr_to_free;
1330            for cpu in cpus_to_free.iter() {
1331                self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1332            }
1333            cpu_pool.free(&cpus_to_free)?;
1334            nr_to_free
1335        } else {
1336            0
1337        })
1338    }
1339
1340    fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1341        let new_cpus = match cpu_pool
1342            .alloc_cpus(&self.allowed_cpus, &self.core_order)
1343            .clone()
1344        {
1345            Some(ret) => ret.clone(),
1346            None => {
1347                trace!("layer-{} can't grow, no CPUs", &self.name);
1348                return Ok(0);
1349            }
1350        };
1351
1352        let nr_new_cpus = new_cpus.weight();
1353
1354        trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1355        self.cpus |= &new_cpus;
1356        self.nr_cpus += nr_new_cpus;
1357        for cpu in new_cpus.iter() {
1358            self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1359        }
1360        Ok(nr_new_cpus)
1361    }
1362}
1363#[derive(Debug, Clone)]
1364struct NodeInfo {
1365    node_mask: nix::sched::CpuSet,
1366    _node_id: usize,
1367}
1368
1369#[derive(Debug)]
1370struct GpuTaskAffinitizer {
1371    // This struct tracks information necessary to NUMA-affinitize
1372    // GPU tasks periodically when needed.
1373    gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1374    gpu_pids_to_devs: HashMap<Pid, u32>,
1375    last_process_time: Option<Instant>,
1376    sys: System,
1377    pid_map: HashMap<Pid, Vec<Pid>>,
1378    poll_interval: Duration,
1379    enable: bool,
1380    tasks_affinitized: u64,
1381    last_task_affinitization_ms: u64,
1382}
1383
1384impl GpuTaskAffinitizer {
1385    pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1386        GpuTaskAffinitizer {
1387            gpu_devs_to_node_info: HashMap::new(),
1388            gpu_pids_to_devs: HashMap::new(),
1389            last_process_time: None,
1390            sys: System::default(),
1391            pid_map: HashMap::new(),
1392            poll_interval: Duration::from_secs(poll_interval),
1393            enable,
1394            tasks_affinitized: 0,
1395            last_task_affinitization_ms: 0,
1396        }
1397    }
1398
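    // Returns the lowest-numbered CPU set in the NVML CPU-affinity bitmask
    // (a vector of 64-bit chunks).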
1399    fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1400        for (chunk, &mask) in affinity.iter().enumerate() {
1401            let mut inner_offset: u64 = 1;
1402            for _ in 0..64 {
1403                if (mask & inner_offset) != 0 {
1404                    return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1405                }
1406                inner_offset <<= 1;
1407            }
1408        }
1409        anyhow::bail!("unable to get CPU from NVML bitmask");
1410    }
1411
1412    fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1413        let mut cpuset = CpuSet::new();
1414        for (cpu_id, _cpu) in &node.all_cpus {
1415            cpuset.set(*cpu_id)?;
1416        }
1417        Ok(cpuset)
1418    }
1419
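    // Build the mapping from GPU device index to the NUMA node that hosts its
    // NVML-reported ideal CPU, so GPU tasks can later be pinned to that node.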
1420    fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1421        let nvml = nvml()?;
1422        let device_count = nvml.device_count()?;
1423
1424        for idx in 0..device_count {
1425            let dev = nvml.device_by_index(idx)?;
1426            // Request 16 64-bit words, enough for machines with up to 1024 CPUs.
1427            let cpu = dev.cpu_affinity(16)?;
1428            let ideal_cpu = self.find_one_cpu(cpu)?;
1429            if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1430                self.gpu_devs_to_node_info.insert(
1431                    idx,
1432                    NodeInfo {
1433                        node_mask: self.node_to_cpuset(
1434                            topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1435                        )?,
1436                        _node_id: cpu.node_id,
1437                    },
1438                );
1439            }
1440        }
1441        Ok(())
1442    }
1443
1444    fn update_gpu_pids(&mut self) -> Result<()> {
1445        let nvml = nvml()?;
1446        for i in 0..nvml.device_count()? {
1447            let device = nvml.device_by_index(i)?;
1448            for proc in device
1449                .running_compute_processes()?
1450                .into_iter()
1451                .chain(device.running_graphics_processes()?.into_iter())
1452            {
1453                self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1454            }
1455        }
1456        Ok(())
1457    }
1458
1459    fn update_process_info(&mut self) -> Result<()> {
1460        self.sys.refresh_processes_specifics(
1461            ProcessesToUpdate::All,
1462            true,
1463            ProcessRefreshKind::nothing(),
1464        );
1465        self.pid_map.clear();
1466        for (pid, proc_) in self.sys.processes() {
1467            if let Some(ppid) = proc_.parent() {
1468                self.pid_map.entry(ppid).or_default().push(*pid);
1469            }
1470        }
1471        Ok(())
1472    }
1473
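    // Breadth-first walk of the process tree rooted at root_pid, collecting the
    // PIDs of all descendants along with the thread IDs of every process visited.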
1474    fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1475        let mut work = VecDeque::from([root_pid]);
1476        let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1477
1478        while let Some(pid) = work.pop_front() {
1479            if pids_and_tids.insert(pid) {
1480                if let Some(kids) = self.pid_map.get(&pid) {
1481                    work.extend(kids);
1482                }
1483                if let Some(proc_) = self.sys.process(pid) {
1484                    if let Some(tasks) = proc_.tasks() {
1485                        pids_and_tids.extend(tasks.iter().copied());
1486                    }
1487                }
1488            }
1489        }
1490        pids_and_tids
1491    }
1492
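    // Pin every task of each GPU-using process (and its descendants) to the NUMA
    // node local to the GPU it uses.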
1493    fn affinitize_gpu_pids(&mut self) -> Result<()> {
1494        if !self.enable {
1495            return Ok(());
1496        }
1497        for (pid, dev) in &self.gpu_pids_to_devs {
1498            let node_info = self
1499                .gpu_devs_to_node_info
1500                .get(&dev)
1501                .expect("Unable to get gpu pid node mask");
1502            for child in self.get_child_pids_and_tids(*pid) {
1503                match nix::sched::sched_setaffinity(
1504                    nix::unistd::Pid::from_raw(child.as_u32() as i32),
1505                    &node_info.node_mask,
1506                ) {
1507                    Ok(_) => {
1508                        // Count the successful affinitization.
1509                        self.tasks_affinitized += 1;
1510                    }
1511                    Err(_) => {
1512                        debug!(
1513                            "Error affinitizing gpu pid {} to node {:#?}",
1514                            child.as_u32(),
1515                            node_info
1516                        );
1517                    }
1518                };
1519            }
1520        }
1521        Ok(())
1522    }
1523
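    // Rate-limited entry point: refresh GPU process information and re-affinitize
    // at most once per poll_interval.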
1524    pub fn maybe_affinitize(&mut self) {
1525        if !self.enable {
1526            return;
1527        }
1528        let now = Instant::now();
1529
1530        if let Some(last_process_time) = self.last_process_time {
1531            if (now - last_process_time) < self.poll_interval {
1532                return;
1533            }
1534        }
1535
1536        match self.update_gpu_pids() {
1537            Ok(_) => {}
1538            Err(e) => {
1539                error!("Error updating GPU PIDs: {}", e);
1540            }
1541        };
1542        match self.update_process_info() {
1543            Ok(_) => {}
1544            Err(e) => {
1545                error!("Error updating process info to affinitize GPU PIDs: {}", e);
1546            }
1547        };
1548        match self.affinitize_gpu_pids() {
1549            Ok(_) => {}
1550            Err(e) => {
1551                error!("Error affinitizing GPU PIDs: {}", e);
1552            }
1553        };
1554        self.last_process_time = Some(now);
1555        self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1556
1557        return;
1558    }
1559
1560    pub fn init(&mut self, topo: Arc<Topology>) {
1561        if !self.enable || self.last_process_time.is_some() {
1562            return;
1563        }
1564
1565        match self.init_dev_node_map(topo) {
1566            Ok(_) => {}
1567            Err(e) => {
1568                error!("Error initializing GPU device-to-node map: {}", e);
1569            }
1570        };
1571        self.sys = System::new_all();
1572        return;
1573    }
1574}
1575
1576struct Scheduler<'a> {
1577    skel: BpfSkel<'a>,
1578    struct_ops: Option<libbpf_rs::Link>,
1579    layer_specs: Vec<LayerSpec>,
1580
1581    sched_intv: Duration,
1582    layer_refresh_intv: Duration,
1583
1584    cpu_pool: CpuPool,
1585    layers: Vec<Layer>,
1586    idle_qos_enabled: bool,
1587
1588    proc_reader: fb_procfs::ProcReader,
1589    sched_stats: Stats,
1590
1591    nr_layer_cpus_ranges: Vec<(usize, usize)>,
1592    processing_dur: Duration,
1593
1594    topo: Arc<Topology>,
1595    netdevs: BTreeMap<String, NetDev>,
1596    stats_server: StatsServer<StatsReq, StatsRes>,
1597    gpu_task_handler: GpuTaskAffinitizer,
1598}
1599
1600impl<'a> Scheduler<'a> {
1601    fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1602        skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1603        let mut perf_set = false;
1604
1605        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1606        let mut layer_weights: Vec<usize> = vec![];
1607
1608        for (spec_i, spec) in specs.iter().enumerate() {
1609            let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1610
1611            for (or_i, or) in spec.matches.iter().enumerate() {
1612                for (and_i, and) in or.iter().enumerate() {
1613                    let mt = &mut layer.matches[or_i].matches[and_i];
1614
1615                    // Rules are allowlist-based by default
1616                    mt.exclude.write(false);
1617
1618                    match and {
1619                        LayerMatch::CgroupPrefix(prefix) => {
1620                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1621                            copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1622                        }
1623                        LayerMatch::CgroupSuffix(suffix) => {
1624                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1625                            copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1626                        }
1627                        LayerMatch::CgroupRegex(_) => {
1628                            panic!("CgroupRegex match only supported in template");
1629                        }
1630                        LayerMatch::CgroupContains(substr) => {
1631                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1632                            copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1633                        }
1634                        LayerMatch::CommPrefix(prefix) => {
1635                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1636                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1637                        }
1638                        LayerMatch::CommPrefixExclude(prefix) => {
1639                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1640                            mt.exclude.write(true);
1641                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1642                        }
1643                        LayerMatch::PcommPrefix(prefix) => {
1644                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1645                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1646                        }
1647                        LayerMatch::PcommPrefixExclude(prefix) => {
1648                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1649                            mt.exclude.write(true);
1650                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1651                        }
1652                        LayerMatch::NiceAbove(nice) => {
1653                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1654                            mt.nice = *nice;
1655                        }
1656                        LayerMatch::NiceBelow(nice) => {
1657                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1658                            mt.nice = *nice;
1659                        }
1660                        LayerMatch::NiceEquals(nice) => {
1661                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1662                            mt.nice = *nice;
1663                        }
1664                        LayerMatch::UIDEquals(user_id) => {
1665                            mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1666                            mt.user_id = *user_id;
1667                        }
1668                        LayerMatch::GIDEquals(group_id) => {
1669                            mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1670                            mt.group_id = *group_id;
1671                        }
1672                        LayerMatch::PIDEquals(pid) => {
1673                            mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1674                            mt.pid = *pid;
1675                        }
1676                        LayerMatch::PPIDEquals(ppid) => {
1677                            mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1678                            mt.ppid = *ppid;
1679                        }
1680                        LayerMatch::TGIDEquals(tgid) => {
1681                            mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1682                            mt.tgid = *tgid;
1683                        }
1684                        LayerMatch::NSPIDEquals(nsid, pid) => {
1685                            mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1686                            mt.nsid = *nsid;
1687                            mt.pid = *pid;
1688                        }
1689                        LayerMatch::NSEquals(nsid) => {
1690                            mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1691                            mt.nsid = *nsid as u64;
1692                        }
1693                        LayerMatch::CmdJoin(joincmd) => {
1694                            mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1695                            copy_into_cstr(&mut mt.comm_prefix, joincmd);
1696                        }
1697                        LayerMatch::IsGroupLeader(polarity) => {
1698                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1699                            mt.is_group_leader.write(*polarity);
1700                        }
1701                        LayerMatch::IsKthread(polarity) => {
1702                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1703                            mt.is_kthread.write(*polarity);
1704                        }
1705                        LayerMatch::UsedGpuTid(polarity) => {
1706                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1707                            mt.used_gpu_tid.write(*polarity);
1708                        }
1709                        LayerMatch::UsedGpuPid(polarity) => {
1710                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1711                            mt.used_gpu_pid.write(*polarity);
1712                        }
1713                        LayerMatch::AvgRuntime(min, max) => {
1714                            mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1715                            mt.min_avg_runtime_us = *min;
1716                            mt.max_avg_runtime_us = *max;
1717                        }
1718                        LayerMatch::HintEquals(hint) => {
1719                            mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1720                            mt.hint = *hint;
1721                        }
1722                    }
1723                }
1724                layer.matches[or_i].nr_match_ands = or.len() as i32;
1725            }
1726
1727            layer.nr_match_ors = spec.matches.len() as u32;
1728            layer.kind = spec.kind.as_bpf_enum();
1729
1730            {
1731                let LayerCommon {
1732                    min_exec_us,
1733                    yield_ignore,
1734                    perf,
1735                    preempt,
1736                    preempt_first,
1737                    exclusive,
1738                    allow_node_aligned,
1739                    skip_remote_node,
1740                    prev_over_idle_core,
1741                    growth_algo,
1742                    nodes,
1743                    slice_us,
1744                    fifo,
1745                    weight,
1746                    disallow_open_after_us,
1747                    disallow_preempt_after_us,
1748                    xllc_mig_min_us,
1749                    placement,
1750                    ..
1751                } = spec.kind.common();
1752
1753                layer.slice_ns = *slice_us * 1000;
1754                layer.fifo.write(*fifo);
1755                layer.min_exec_ns = min_exec_us * 1000;
1756                layer.yield_step_ns = if *yield_ignore > 0.999 {
1757                    0
1758                } else if *yield_ignore < 0.001 {
1759                    layer.slice_ns
1760                } else {
1761                    (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1762                };
1763                let mut layer_name: String = spec.name.clone();
1764                layer_name.truncate(MAX_LAYER_NAME);
1765                copy_into_cstr(&mut layer.name, layer_name.as_str());
1766                layer.preempt.write(*preempt);
1767                layer.preempt_first.write(*preempt_first);
1768                layer.excl.write(*exclusive);
1769                layer.allow_node_aligned.write(*allow_node_aligned);
1770                layer.skip_remote_node.write(*skip_remote_node);
1771                layer.prev_over_idle_core.write(*prev_over_idle_core);
1772                layer.growth_algo = growth_algo.as_bpf_enum();
1773                layer.weight = *weight;
1774                layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1775                    v if v == u64::MAX => v,
1776                    v => v * 1000,
1777                };
1778                layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1779                    v if v == u64::MAX => v,
1780                    v => v * 1000,
1781                };
1782                layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1783                layer_weights.push(layer.weight.try_into().unwrap());
1784                layer.perf = u32::try_from(*perf)?;
1785                layer.node_mask = nodemask_from_nodes(nodes) as u64;
1786                for (topo_node_id, topo_node) in &topo.nodes {
1787                    if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1788                        continue;
1789                    }
1790                    layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1791                }
1792
1793                let task_place = |place: u32| crate::types::layer_task_place(place);
1794                layer.task_place = match placement {
1795                    LayerPlacement::Standard => {
1796                        task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1797                    }
1798                    LayerPlacement::Sticky => {
1799                        task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1800                    }
1801                    LayerPlacement::Floating => {
1802                        task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1803                    }
1804                };
1805            }
1806
1807            layer.is_protected.write(match spec.kind {
1808                LayerKind::Open { .. } => false,
1809                LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1810                    protected
1811                }
1812            });
1813
1814            match &spec.cpuset {
1815                Some(mask) => {
1816                    Self::update_cpumask(&mask, &mut layer.cpuset);
1817                }
1818                None => {
1819                    for i in 0..layer.cpuset.len() {
1820                        layer.cpuset[i] = u8::MAX;
1821                    }
1822                }
1823            };
1824
1825            perf_set |= layer.perf > 0;
1826        }
1827
1828        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1829        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1830            skel.maps
1831                .rodata_data
1832                .as_mut()
1833                .unwrap()
1834                .layer_iteration_order[idx] = *layer_idx as u32;
1835        }
1836
1837        if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1838            warn!("cpufreq support not available, ignoring perf configurations");
1839        }
1840
1841        Ok(())
1842    }
1843
1844    fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1845        skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
1846        skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
1847
1848        for (&node_id, node) in &topo.nodes {
1849            debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1850            skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
1851            let raw_numa_slice = node.span.as_raw_slice();
1852            let node_cpumask_slice =
1853                &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
1854            let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1855            left.clone_from_slice(raw_numa_slice);
1856            debug!(
1857                "node {} mask: {:?}",
1858                node_id,
1859                skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
1860            );
1861
1862            for llc in node.llcs.values() {
1863                debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1864                skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
1865            }
1866        }
1867
1868        for cpu in topo.all_cpus.values() {
1869            skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1870        }
1871    }
1872
1873    fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
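        // Sort IDs by distance from a center so that each CPU walks its neighbors
        // closest-first; radiate_cpu prefers CPUs on nearby cores before falling
        // back to raw CPU-ID distance. For example, radiating [0..6) around center
        // 3 yields [3, 2, 4, 1, 5, 0] (ties keep their original order as the sort
        // is stable).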
1874        let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1875            vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1876            vec
1877        };
1878        let radiate_cpu =
1879            |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1880                vec.sort_by_key(|&id| {
1881                    (
1882                        (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1883                        (center_cpu as i32 - id as i32).abs(),
1884                    )
1885                });
1886                vec
1887            };
1888
1889        for (&cpu_id, cpu) in &topo.all_cpus {
1890            // Collect the spans.
1891            let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1892            let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1893            let node_span = &topo.nodes[&cpu.node_id].span;
1894            let sys_span = &topo.span;
1895
1896            // Make the spans exclusive.
1897            let sys_span = sys_span.and(&node_span.not());
1898            let node_span = node_span.and(&llc_span.not());
1899            let llc_span = llc_span.and(&core_span.not());
1900            core_span.clear_cpu(cpu_id).unwrap();
1901
1902            // Convert them into arrays.
1903            let mut sys_order: Vec<usize> = sys_span.iter().collect();
1904            let mut node_order: Vec<usize> = node_span.iter().collect();
1905            let mut llc_order: Vec<usize> = llc_span.iter().collect();
1906            let mut core_order: Vec<usize> = core_span.iter().collect();
1907
1908            // Shuffle them so that different CPUs follow different orders.
1909            // Each CPU radiates in both directions based on the cpu id and
1910            // radiates out to the closest cores based on core ids.
1911
1912            sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1913            node_order = radiate(node_order, cpu.node_id);
1914            llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1915            core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1916
1917            // Concatenate and record the topology boundaries.
1918            let mut order: Vec<usize> = vec![];
1919            let mut idx: usize = 0;
1920
1921            idx += 1;
1922            order.push(cpu_id);
1923
1924            idx += core_order.len();
1925            order.append(&mut core_order);
1926            let core_end = idx;
1927
1928            idx += llc_order.len();
1929            order.append(&mut llc_order);
1930            let llc_end = idx;
1931
1932            idx += node_order.len();
1933            order.append(&mut node_order);
1934            let node_end = idx;
1935
1936            idx += sys_order.len();
1937            order.append(&mut sys_order);
1938            let sys_end = idx;
1939
1940            debug!(
1941                "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1942                cpu_id, core_end, llc_end, node_end, sys_end, &order
1943            );
1944
1945            // Record in cpu_ctx.
1946            let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1947            for (i, &cpu) in order.iter().enumerate() {
1948                pmap.cpus[i] = cpu as u16;
1949            }
1950            pmap.core_end = core_end as u32;
1951            pmap.llc_end = llc_end as u32;
1952            pmap.node_end = node_end as u32;
1953            pmap.sys_end = sys_end as u32;
1954        }
1955    }
1956
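    // Serialize each cpu_ctx struct into its raw byte representation, which is the
    // form the per-CPU map update API expects.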
1957    fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1958        cpu_ctxs
1959            .into_iter()
1960            .map(|cpu_ctx| {
1961                let bytes = unsafe {
1962                    std::slice::from_raw_parts(
1963                        &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1964                        std::mem::size_of::<bpf_intf::cpu_ctx>(),
1965                    )
1966                };
1967                bytes.to_vec()
1968            })
1969            .collect()
1970    }
1971
1972    fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1973        let key = (0_u32).to_ne_bytes();
1974        let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1975        let cpu_ctxs_vec = skel
1976            .maps
1977            .cpu_ctxs
1978            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1979            .context("Failed to lookup cpu_ctx")?
1980            .unwrap();
1981
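        // Partition layer indices by kind and preemption: op/on are open layers
        // with/without preempt, gp/gn are grouped layers with/without preempt.
        // These lists seed the randomized per-CPU iteration orders (and the
        // combined ogp/ogn orders) set up below.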
1982        let op_layers: Vec<u32> = layer_specs
1983            .iter()
1984            .enumerate()
1985            .filter(|(_idx, spec)| match &spec.kind {
1986                LayerKind::Open { .. } => spec.kind.common().preempt,
1987                _ => false,
1988            })
1989            .map(|(idx, _)| idx as u32)
1990            .collect();
1991        let on_layers: Vec<u32> = layer_specs
1992            .iter()
1993            .enumerate()
1994            .filter(|(_idx, spec)| match &spec.kind {
1995                LayerKind::Open { .. } => !spec.kind.common().preempt,
1996                _ => false,
1997            })
1998            .map(|(idx, _)| idx as u32)
1999            .collect();
2000        let gp_layers: Vec<u32> = layer_specs
2001            .iter()
2002            .enumerate()
2003            .filter(|(_idx, spec)| match &spec.kind {
2004                LayerKind::Grouped { .. } => spec.kind.common().preempt,
2005                _ => false,
2006            })
2007            .map(|(idx, _)| idx as u32)
2008            .collect();
2009        let gn_layers: Vec<u32> = layer_specs
2010            .iter()
2011            .enumerate()
2012            .filter(|(_idx, spec)| match &spec.kind {
2013                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2014                _ => false,
2015            })
2016            .map(|(idx, _)| idx as u32)
2017            .collect();
2018
2019        // FIXME - this incorrectly assumes all possible CPUs are consecutive.
2020        for cpu in 0..*NR_CPUS_POSSIBLE {
2021            cpu_ctxs.push(*unsafe {
2022                &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2023            });
2024
2025            let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2026            let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2027            cpu_ctxs[cpu].cpu = cpu as i32;
2028            cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2029            cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
2030            cpu_ctxs[cpu].is_big = is_big;
2031
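            // Seed per CPU so that every CPU gets a different, but deterministic,
            // shuffle of the layer iteration orders.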
2032            fastrand::seed(cpu as u64);
2033
2034            let mut ogp_order = op_layers.clone();
2035            ogp_order.append(&mut gp_layers.clone());
2036            fastrand::shuffle(&mut ogp_order);
2037
2038            let mut ogn_order = on_layers.clone();
2039            ogn_order.append(&mut gn_layers.clone());
2040            fastrand::shuffle(&mut ogn_order);
2041
2042            let mut op_order = op_layers.clone();
2043            fastrand::shuffle(&mut op_order);
2044
2045            let mut on_order = on_layers.clone();
2046            fastrand::shuffle(&mut on_order);
2047
2048            let mut gp_order = gp_layers.clone();
2049            fastrand::shuffle(&mut gp_order);
2050
2051            let mut gn_order = gn_layers.clone();
2052            fastrand::shuffle(&mut gn_order);
2053
2054            for i in 0..MAX_LAYERS {
2055                cpu_ctxs[cpu].ogp_layer_order[i] =
2056                    ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2057                cpu_ctxs[cpu].ogn_layer_order[i] =
2058                    ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2059
2060                cpu_ctxs[cpu].op_layer_order[i] =
2061                    op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2062                cpu_ctxs[cpu].on_layer_order[i] =
2063                    on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2064                cpu_ctxs[cpu].gp_layer_order[i] =
2065                    gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2066                cpu_ctxs[cpu].gn_layer_order[i] =
2067                    gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2068            }
2069        }
2070
2071        Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2072
2073        skel.maps
2074            .cpu_ctxs
2075            .update_percpu(
2076                &key,
2077                &Self::convert_cpu_ctxs(cpu_ctxs),
2078                libbpf_rs::MapFlags::ANY,
2079            )
2080            .context("Failed to update cpu_ctx")?;
2081
2082        Ok(())
2083    }
2084
2085    fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2086        for (&llc_id, llc) in &topo.all_llcs {
2087            // Collect the orders.
2088            let mut node_order: Vec<usize> =
2089                topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2090            let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2091
2092            // Make the orders exclusive.
2093            sys_order.retain(|id| !node_order.contains(id));
2094            node_order.retain(|&id| id != llc_id);
2095
2096            // Shuffle so that different LLCs follow different orders. See
2097            // init_cpu_prox_map().
2098            fastrand::seed(llc_id as u64);
2099            fastrand::shuffle(&mut sys_order);
2100            fastrand::shuffle(&mut node_order);
2101
2102            // Concatenate and record the node boundary.
2103            let mut order: Vec<usize> = vec![];
2104            let mut idx: usize = 0;
2105
2106            idx += 1;
2107            order.push(llc_id);
2108
2109            idx += node_order.len();
2110            order.append(&mut node_order);
2111            let node_end = idx;
2112
2113            idx += sys_order.len();
2114            order.append(&mut sys_order);
2115            let sys_end = idx;
2116
2117            debug!(
2118                "LLC[{}] proximity map[{}/{}]: {:?}",
2119                llc_id, node_end, sys_end, &order
2120            );
2121
2122            // Record in llc_ctx.
2123            //
2124            // XXX - This would be a lot easier if llc_ctx were in the bss.
2125            // See BpfStats::read().
2126            let key = llc_id as u32;
2127            let llc_id_slice =
2128                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2129            let v = skel
2130                .maps
2131                .llc_data
2132                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2133                .unwrap()
2134                .unwrap();
2135            let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2136
2137            let pmap = &mut llcc.prox_map;
2138            for (i, &llc_id) in order.iter().enumerate() {
2139                pmap.llcs[i] = llc_id as u16;
2140            }
2141            pmap.node_end = node_end as u32;
2142            pmap.sys_end = sys_end as u32;
2143
2144            let v = unsafe {
2145                std::slice::from_raw_parts(
2146                    &llcc as *const bpf_intf::llc_ctx as *const u8,
2147                    std::mem::size_of::<bpf_intf::llc_ctx>(),
2148                )
2149            };
2150
2151            skel.maps
2152                .llc_data
2153                .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2154        }
2155
2156        Ok(())
2157    }
2158
2159    fn init(
2160        opts: &'a Opts,
2161        layer_specs: &[LayerSpec],
2162        open_object: &'a mut MaybeUninit<OpenObject>,
2163        hint_to_layer_map: &HashMap<u64, usize>,
2164        membw_tracking: bool,
2165    ) -> Result<Self> {
2166        let nr_layers = layer_specs.len();
2167        let mut disable_topology = opts.disable_topology.unwrap_or(false);
2168
2169        let topo = Arc::new(if disable_topology {
2170            Topology::with_flattened_llc_node()?
2171        } else {
2172            Topology::new()?
2173        });
2174
2175        /*
2176         * FIXME: scx_layered incorrectly assumes that node, LLC and CPU IDs
2177         * are consecutive. Verify that they are on this system and bail if
2178         * not. It's lucky that core ID is not used anywhere as core IDs are
2179         * not consecutive on some Ryzen CPUs.
2180         */
2181        if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2182            bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2183        }
2184        if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2185            bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2186        }
2187        if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2188            bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2189        }
2190
2191        let netdevs = if opts.netdev_irq_balance {
2192            warn!(
2193                "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2194            );
2195            read_netdevs()?
2196        } else {
2197            BTreeMap::new()
2198        };
2199
2200        if !disable_topology {
2201            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2202                disable_topology = true;
2203            };
2204            info!(
2205                "Topology awareness not specified, selecting {} based on hardware",
2206                if disable_topology {
2207                    "disabled"
2208                } else {
2209                    "enabled"
2210                }
2211            );
2212        };
2213
2214        let cpu_pool = CpuPool::new(topo.clone())?;
2215
2216        // If topology awareness is disabled, clear out any configured NUMA/LLC
2217        // restrictions so the layers fall back to using all cores.
2218        let layer_specs: Vec<_> = if disable_topology {
2219            info!("Disabling topology awareness");
2220            layer_specs
2221                .iter()
2222                .cloned()
2223                .map(|mut s| {
2224                    s.kind.common_mut().nodes.clear();
2225                    s.kind.common_mut().llcs.clear();
2226                    s
2227                })
2228                .collect()
2229        } else {
2230            layer_specs.to_vec()
2231        };
2232
2233        // Check kernel features
2234        init_libbpf_logging(None);
2235        let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2236        if !kfuncs_in_syscall {
2237            warn!("Using slow path: kfuncs not supported in syscall programs (kernel lacks commit a8e03b6bbb2c)");
2238        }
2239
2240        // Open the BPF prog first for verification.
2241        let mut skel_builder = BpfSkelBuilder::default();
2242        skel_builder.obj_builder.debug(opts.verbose > 1);
2243        info!(
2244            "Running scx_layered (build ID: {})",
2245            build_id::full_version(env!("CARGO_PKG_VERSION"))
2246        );
2247        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2248        let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2249
2250        // Only autoload the memory BW tracking program when requested.
2251        skel.progs.scx_pmu_tc.set_autoload(membw_tracking);
2252
2253        // Enable autoload for conditionally loaded programs immediately after
2254        // creating the skeleton, since this always happens before loading.
2255        if opts.enable_gpu_support {
2256            // By default, enable the open kprobe whenever GPU support is enabled;
2257            // open has been observed to be relatively cheap to kprobe.
2258            if opts.gpu_kprobe_level >= 1 {
2259                compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2260            }
2261            // Enable the rest progressively based upon how often they are
2262            // called in observed workloads.
2263            if opts.gpu_kprobe_level >= 2 {
2264                compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2265            }
2266            if opts.gpu_kprobe_level >= 3 {
2267                compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2268            }
2269        }
2270
2271        let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2272        let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2273
2274        let event = if membw_tracking {
2275            setup_membw_tracking(&mut skel)?
2276        } else {
2277            0
2278        };
2279
2280        let rodata = skel.maps.rodata_data.as_mut().unwrap();
2281
2282        if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
2283            rodata.ext_sched_class_addr = ext_sched_class_addr.unwrap();
2284            rodata.idle_sched_class_addr = idle_sched_class_addr.unwrap();
2285        } else {
2286            warn!(
2287                "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2288            );
2289        }
2290
2291        rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2292        rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2293
2294        // Initialize skel according to @opts.
2295        skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2296
2297        if !opts.disable_queued_wakeup {
2298            match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2299                0 => info!("Kernel does not support queued wakeup optimization"),
2300                v => skel.struct_ops.layered_mut().flags |= v,
2301            }
2302        }
2303
2304        rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2305        rodata.percpu_kthread_preempt_all =
2306            !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2307        rodata.debug = opts.verbose as u32;
2308        rodata.slice_ns = opts.slice_us * 1000;
2309        rodata.max_exec_ns = if opts.max_exec_us > 0 {
2310            opts.max_exec_us * 1000
2311        } else {
2312            opts.slice_us * 1000 * 20
2313        };
2314        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2315        rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2316        rodata.smt_enabled = topo.smt_enabled;
2317        rodata.has_little_cores = topo.has_little_cores();
2318        rodata.xnuma_preemption = opts.xnuma_preemption;
2319        rodata.antistall_sec = opts.antistall_sec;
2320        rodata.monitor_disable = opts.monitor_disable;
2321        rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2322        rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2323        rodata.enable_antistall = !opts.disable_antistall;
2324        rodata.enable_match_debug = opts.enable_match_debug;
2325        rodata.enable_gpu_support = opts.enable_gpu_support;
2326        rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2327
2328        for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2329            rodata.__sibling_cpu[cpu] = *sib;
2330        }
2331        for cpu in topo.all_cpus.keys() {
2332            rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2333        }
2334
2335        rodata.nr_op_layers = layer_specs
2336            .iter()
2337            .filter(|spec| match &spec.kind {
2338                LayerKind::Open { .. } => spec.kind.common().preempt,
2339                _ => false,
2340            })
2341            .count() as u32;
2342        rodata.nr_on_layers = layer_specs
2343            .iter()
2344            .filter(|spec| match &spec.kind {
2345                LayerKind::Open { .. } => !spec.kind.common().preempt,
2346                _ => false,
2347            })
2348            .count() as u32;
2349        rodata.nr_gp_layers = layer_specs
2350            .iter()
2351            .filter(|spec| match &spec.kind {
2352                LayerKind::Grouped { .. } => spec.kind.common().preempt,
2353                _ => false,
2354            })
2355            .count() as u32;
2356        rodata.nr_gn_layers = layer_specs
2357            .iter()
2358            .filter(|spec| match &spec.kind {
2359                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2360                _ => false,
2361            })
2362            .count() as u32;
2363        rodata.nr_excl_layers = layer_specs
2364            .iter()
2365            .filter(|spec| spec.kind.common().exclusive)
2366            .count() as u32;
2367
2368        let mut min_open = u64::MAX;
2369        let mut min_preempt = u64::MAX;
2370
2371        for spec in layer_specs.iter() {
2372            if let LayerKind::Open { common, .. } = &spec.kind {
2373                min_open = min_open.min(common.disallow_open_after_us.unwrap());
2374                min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2375            }
2376        }
2377
2378        rodata.min_open_layer_disallow_open_after_ns = match min_open {
2379            u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2380            v => v,
2381        };
2382        rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2383            u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2384            v => v,
2385        };
2386
2387        // Consider all layers empty at the beginning.
2388        for i in 0..layer_specs.len() {
2389            skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2390        }
2391        skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2392
2393        // We set the pin path before loading the skeleton. This will ensure
2394        // libbpf creates and pins the map, or reuses the pinned map fd for us,
2395        // so that we can keep reusing the older map already pinned on scheduler
2396        // restarts.
2397        let layered_task_hint_map_path = &opts.task_hint_map;
2398        let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2399        // Only set pin path if a path is provided.
2400        if !layered_task_hint_map_path.is_empty() {
2401            hint_map.set_pin_path(layered_task_hint_map_path).unwrap();
2402            rodata.task_hint_map_enabled = true;
2403        }
2404
2405        if !opts.hi_fb_thread_name.is_empty() {
2406            let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2407            copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2408            rodata.enable_hi_fb_thread_name_match = true;
2409        }
2410
2411        Self::init_layers(&mut skel, &layer_specs, &topo)?;
2412        Self::init_nodes(&mut skel, opts, &topo);
2413
2414        let mut skel = scx_ops_load!(skel, layered, uei)?;
2415
2416        // Populate the mapping of hints to layer IDs for faster lookups
2417        if !hint_to_layer_map.is_empty() {
2418            for (k, v) in hint_to_layer_map.iter() {
2419                let key: u32 = *k as u32;
2420                let value: u32 = *v as u32;
2421                skel.maps.hint_to_layer_id_map.update(
2422                    &key.to_ne_bytes(),
2423                    &value.to_ne_bytes(),
2424                    libbpf_rs::MapFlags::ANY,
2425                )?;
2426            }
2427        }
2428
2429        if membw_tracking {
2430            create_perf_fds(&mut skel, event)?;
2431        }
2432
2433        let mut layers = vec![];
2434        let layer_growth_orders =
2435            LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2436        for (idx, spec) in layer_specs.iter().enumerate() {
2437            let growth_order = layer_growth_orders
2438                .get(&idx)
2439                .with_context(|| "layer has no growth order".to_string())?;
2440            layers.push(Layer::new(spec, &topo, growth_order)?);
2441        }
2442
2443        let mut idle_qos_enabled = layers
2444            .iter()
2445            .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2446        if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2447            warn!("idle_resume_us not supported, ignoring");
2448            idle_qos_enabled = false;
2449        }
2450
2451        Self::init_cpus(&skel, &layer_specs, &topo)?;
2452        Self::init_llc_prox_map(&mut skel, &topo)?;
2453
2454        // Other stuff.
2455        let proc_reader = fb_procfs::ProcReader::new();
2456
2457        // Handle setup if layered is running in a pid namespace.
2458        let input = ProgramInput {
2459            ..Default::default()
2460        };
2461        let prog = &mut skel.progs.initialize_pid_namespace;
2462
2463        let _ = prog.test_run(input);
2464
2465        // XXX If we try to refresh the cpumasks here before attaching, we
2466        // sometimes (non-deterministically) don't see the updated values in
2467        // BPF. It would be better to update the cpumasks here before we
2468        // attach, but the value will quickly converge anyways so it's not a
2469        // huge problem in the interim until we figure it out.
2470
2471        // Allow all tasks to open and write to BPF task hint map, now that
2472        // we should have it pinned at the desired location.
2473        if !layered_task_hint_map_path.is_empty() {
2474            let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2475            let mode: libc::mode_t = 0o666;
2476            unsafe {
2477                if libc::chmod(path.as_ptr(), mode) != 0 {
2478                    trace!("'chmod' to 666 of task hint map failed, continuing...");
2479                }
2480            }
2481        }
2482
2483        // Attach.
2484        let struct_ops = scx_ops_attach!(skel, layered)?;
2485        let stats_server = StatsServer::new(stats::server_data()).launch()?;
2486        let mut gpu_task_handler =
2487            GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2488        gpu_task_handler.init(topo.clone());
2489        let sched = Self {
2490            struct_ops: Some(struct_ops),
2491            layer_specs,
2492
2493            sched_intv: Duration::from_secs_f64(opts.interval),
2494            layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2495
2496            cpu_pool,
2497            layers,
2498            idle_qos_enabled,
2499
2500            sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2501
2502            nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2503            processing_dur: Default::default(),
2504
2505            proc_reader,
2506            skel,
2507
2508            topo,
2509            netdevs,
2510            stats_server,
2511            gpu_task_handler,
2512        };
2513
2514        info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2515
2516        Ok(sched)
2517    }
2518
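    // Copy a Cpumask into the packed bitmap layout the BPF side expects: one bit
    // per CPU, eight CPUs per byte.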
2519    fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2520        for cpu in 0..mask.len() {
2521            if mask.test_cpu(cpu) {
2522                bpfmask[cpu / 8] |= 1 << (cpu % 8);
2523            } else {
2524                bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2525            }
2526        }
2527    }
2528
2529    fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2530        trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2531        Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2532
2533        bpf_layer.nr_cpus = layer.nr_cpus as u32;
2534        for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2535            bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2536        }
2537
2538        bpf_layer.refresh_cpus = 1;
2539    }
2540
2541    fn update_netdev_cpumasks(&mut self) -> Result<()> {
2542        let available_cpus = self.cpu_pool.available_cpus();
2543        if available_cpus.is_empty() {
2544            return Ok(());
2545        }
2546
2547        for (iface, netdev) in self.netdevs.iter_mut() {
2548            let node = self
2549                .topo
2550                .nodes
2551                .values()
2552                .find(|n| n.id == netdev.node())
2554                .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2555            let node_cpus = node.span.clone();
2556            for (irq, irqmask) in netdev.irqs.iter_mut() {
2557                irqmask.clear_all();
2558                for cpu in available_cpus.iter() {
2559                    if !node_cpus.test_cpu(cpu) {
2560                        continue;
2561                    }
2562                    let _ = irqmask.set_cpu(cpu);
2563                }
2564                // If no CPUs are available in the node then spread the load across the node
2565                if irqmask.weight() == 0 {
2566                    for cpu in node_cpus.iter() {
2567                        let _ = irqmask.set_cpu(cpu);
2568                    }
2569                }
2570                trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2571            }
2572            netdev.apply_cpumasks()?;
2573        }
2574
2575        Ok(())
2576    }
2577
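    // Clamp a layer's high CPU target so that its projected memory bandwidth stays
    // under the configured limit, using last interval's bandwidth divided by the
    // current CPU count as a rough per-CPU estimate. For example, 4 CPUs consuming
    // 40 GiB/s total (10 GiB/s per CPU) against a 60 GiB/s limit clamps high to 6.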
2578    fn clamp_target_by_membw(
2579        &self,
2580        layer: &Layer,
2581        membw_limit: usize,
2582        last_membw: usize,
2583        low: usize,
2584        high: usize,
2585    ) -> usize {
2586        let ncpu = layer.cpus.weight();
2587        let last_membw_percpu = if ncpu > 0 { last_membw / ncpu } else { 0 };
2588
2589        // If there's no way we're hitting the limit, we're good.
2590        if last_membw_percpu * high < membw_limit {
2591            return high;
2592        }
2593
2594        // If there's no way to drop our memory usage down enough,
2595        // pin the target CPUs to low and drop a warning.
2596        if last_membw_percpu * low > membw_limit {
2597            warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2598            return low;
2599        }
2600
2601        // The bandwidth-implied CPU ceiling lies between low and high; clamp high to it.
2602        membw_limit / last_membw_percpu
2603    }
2604
2605    /// Calculate how many CPUs each layer would like to have if there were
2606    /// no competition. The CPU range is determined by applying the inverse
2607    /// of util_range and then capping by cpus_range. If the current
2608    /// allocation is within the acceptable range, no change is made.
2609    /// Returns (target, min) pair for each layer.
2610    fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2611        let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2612        let utils = &self.sched_stats.layer_utils;
2613
2614        let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2615        let mut targets: Vec<(usize, usize)> = vec![];
2616
2617        for (idx, layer) in self.layers.iter().enumerate() {
2618            targets.push(match &layer.kind {
2619                LayerKind::Confined {
2620                    util_range,
2621                    cpus_range,
2622                    cpus_range_frac,
2623                    membw_gb,
2624                    ..
2625                }
2626                | LayerKind::Grouped {
2627                    util_range,
2628                    cpus_range,
2629                    cpus_range_frac,
2630                    membw_gb,
2631                    ..
2632                } => {
2633                    // A grouped layer can choose to include open cputime
2634                    // for sizing. Also, as an empty layer can only get CPU
2635                    // time through fallback (counted as owned) or open
2636                    // execution, add open cputime for empty layers.
2637                    let owned = utils[idx][LAYER_USAGE_OWNED];
2638                    let open = utils[idx][LAYER_USAGE_OPEN];
2639
2640                    let mut util = owned;
2641                    if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2642                        util += open;
2643                    }
2644
2645                    let util = if util < 0.01 { 0.0 } else { util };
2646                    let low = (util / util_range.1).ceil() as usize;
2647                    let mut high = ((util / util_range.0).floor() as usize).max(low);
2648
2649                    if let Some(membw_limit) = membw_gb {
2650                        let membw_cpus = &self.sched_stats.layer_membws[idx];
2651                        let last_membw = membw_cpus.iter().map(|&x| x as usize).sum();
2652                        trace!(
2653                            "(last_membw, membw_limit): ({last_membw} bytes, {membw_limit} GiB)"
2654                        );
2655
2656                        high = self.clamp_target_by_membw(
2657                            &layer,
2658                            (*membw_limit * (1024_u64.pow(3) as f64)) as usize,
2659                            last_membw,
2660                            low,
2661                            high,
2662                        );
2663                    }
2664
2665                    let target = layer.cpus.weight().clamp(low, high);
2666                    let cpus_range =
2667                        resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2668
2669                    records.push((
2670                        (owned * 100.0) as u64,
2671                        (open * 100.0) as u64,
2672                        (util * 100.0) as u64,
2673                        low,
2674                        high,
2675                        target,
2676                    ));
2677
2678                    (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2679                }
2680                LayerKind::Open { .. } => (0, 0),
2681            });
2682        }
2683
2684        trace!("initial targets: {:?}", &targets);
2685        trace!("(owned, open, util, low, high, target): {:?}", &records);
2686        targets
2687    }
2688
2689    /// Given the (target, min) pair for each layer, determined assuming an
2690    /// infinite number of CPUs, distribute the actual CPUs according to the
2691    /// layers' weights.
2692    fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2693        let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2694        let weights: Vec<usize> = self
2695            .layers
2696            .iter()
2697            .map(|layer| layer.kind.common().weight as usize)
2698            .collect();
2699        let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2700            .iter()
2701            .zip(&weights)
2702            .enumerate()
2703            .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2704            .collect();
2705        let mut weight_sum: usize = weights.iter().sum();
2706        let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2707
2708        trace!("cands: {:?}", &cands);
2709
2710        // First, accept all layers that are <= min.
2711        cands.retain(|&i, &mut (target, min, weight)| {
2712            if target <= min {
2713                let target = target.min(nr_left);
2714                weighted[i] = target;
2715                weight_sum -= weight;
2716                nr_left -= target;
2717                false
2718            } else {
2719                true
2720            }
2721        });
2722
2723        trace!("cands after accepting mins: {:?}", &cands);
2724
2725        // Keep accepting ones under their allotted share.
2726        let calc_share = |nr_left, weight, weight_sum| {
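        // calc_share rounds the weighted share up; e.g. with 10 CPUs left and a
        // weight of 30 out of a remaining weight_sum of 100, the share is
        // ceil(10 * 30 / 100) = 3.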
2727            (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2728        };
2729
2730        while !cands.is_empty() {
2731            let mut progress = false;
2732
2733            cands.retain(|&i, &mut (target, _min, weight)| {
2734                let share = calc_share(nr_left, weight, weight_sum);
2735                if target <= share {
2736                    weighted[i] = target;
2737                    weight_sum -= weight;
2738                    nr_left -= target;
2739                    progress = true;
2740                    false
2741                } else {
2742                    true
2743                }
2744            });
2745
2746            if !progress {
2747                break;
2748            }
2749        }
2750
2751        trace!("cands after accepting under allotted: {:?}", &cands);
2752
2753        // The remaining candidates are in contention with each other;
2754        // distribute according to their shares.
2755        let nr_to_share = nr_left;
2756        for (i, (_target, _min, weight)) in cands.into_iter() {
2757            let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2758            weighted[i] = share;
2759            nr_left -= share;
2760        }
2761
2762        trace!("weighted: {:?}", &weighted);
2763
2764        weighted
2765    }
2766
2767    // Figure out a (full_llcs, extra_cores) tuple for the target CPU count
2768    // computed by weighted_target_nr_cpus: the number of full LLCs a layer
2769    // occupies, plus the extra cores needed for CPUs that don't fill an LLC.
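    //
    // Example (hypothetical topology): with 8 cores per LLC and 2 CPUs per
    // core (16 CPUs per LLC), a target of 40 CPUs yields 40 / 16 = 2 full
    // LLCs and 40 % 16 = 8 leftover CPUs, i.e. (2, ceil(8 / 2)) = (2, 4).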
2770    fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2771        // TODO(kkd): We assume each LLC has equal number of cores.
2772        let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2773        // TODO(kkd): We assume each core has fixed number of threads.
2774        let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2775        let cpus_per_llc = cores_per_llc * cpus_per_core;
2776
2777        let full = target / cpus_per_llc;
2778        let extra = target % cpus_per_llc;
2779
2780        (full, extra.div_ceil(cpus_per_core))
2781    }
2782
2783    // Recalculate the core order for layers using the StickyDynamic growth
2784    // algorithm. Tuples from compute_target_llcs decide how many LLCs and
2785    // cores should be assigned to each layer; the logic that allocates and
2786    // frees CPUs operates on that core order. This happens in three logical
2787    // steps: first, free LLCs from layers that shrank since the last
2788    // recomputation; then distribute the freed LLCs to growing layers; and
2789    // finally spill the remaining cores over into free LLCs.
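    //
    // Hypothetical example: if layer A shrinks from 3 LLCs to 1 while layer B
    // grows from 1 LLC to 2, step one returns two of A's LLCs to the free
    // pool, step two hands one of them to B, and step three spills any
    // partial-LLC ("extra") cores into the LLCs still left in the free pool.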
2790    fn recompute_layer_core_order(&mut self, layer_targets: &[(usize, usize)]) {
2791        // Collect freed LLCs from shrinking layers.
2792        debug!(
2793            " free: before pass: free_llcs={:?}",
2794            self.cpu_pool.free_llcs
2795        );
2796        for &(idx, target) in layer_targets.iter().rev() {
2797            let layer = &mut self.layers[idx];
2798            let old_tlc = layer.target_llc_cpus;
2799            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2800
2801            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2802                continue;
2803            }
2804
2805            let mut to_free = old_tlc.0.saturating_sub(new_tlc.0);
2806
2807            debug!(
2808                " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2809                layer.name,
2810                old_tlc,
2811                new_tlc,
2812                to_free,
2813                layer.assigned_llcs.len(),
2814                self.cpu_pool.free_llcs.len()
2815            );
2816
2817            while to_free > 0 && !layer.assigned_llcs.is_empty() {
2818                let llc = layer.assigned_llcs.pop().unwrap();
2819                self.cpu_pool.free_llcs.push((llc, 0));
2820                to_free -= 1;
2821
2822                debug!(" layer={} freed_llc={}", layer.name, llc);
2823            }
2824        }
2825        debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2826
2827        // Redistribute the freed LLCs to growing layers.
2828        for &(idx, target) in layer_targets.iter().rev() {
2829            let layer = &mut self.layers[idx];
2830            let old_tlc = layer.target_llc_cpus;
2831            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2832
2833            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2834                continue;
2835            }
2836
2837            let mut to_alloc = new_tlc.0.saturating_sub(old_tlc.0);
2838
2839            debug!(
2840                " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2841                layer.name,
2842                old_tlc,
2843                new_tlc,
2844                to_alloc,
2845                layer.assigned_llcs.len(),
2846                self.cpu_pool.free_llcs.len()
2847            );
2848
2849            while to_alloc > 0
2850                && self.cpu_pool.free_llcs.len() > 0
2851                && to_alloc <= self.cpu_pool.free_llcs.len()
2852            {
2853                let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2854                layer.assigned_llcs.push(llc);
2855                to_alloc -= 1;
2856
2857                debug!(" layer={} alloc_llc={}", layer.name, llc);
2858            }
2859
2860            debug!(
2861                " alloc: layer={} assigned_llcs={:?}",
2862                layer.name, layer.assigned_llcs
2863            );
2864
2865            // Update for next iteration.
2866            layer.target_llc_cpus = new_tlc;
2867        }
2868
2869        // Spill overflowing cores over into free LLCs. Bigger layers get to
2870        // take a chunk before smaller layers.
2871        for &(idx, _) in layer_targets.iter() {
2872            let mut core_order = vec![];
2873            let layer = &mut self.layers[idx];
2874
2875            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2876                continue;
2877            }
2878
2879            let tlc = layer.target_llc_cpus;
2880            let mut extra = tlc.1;
2881            // TODO(kkd): Move this logic into cpu_pool? What's the best place?
2882            let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2883            let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2884            let cpus_per_llc = cores_per_llc * cpus_per_core;
2885
2886            // Consume from front since we pop from the back.
2887            for i in 0..self.cpu_pool.free_llcs.len() {
2888                let free_vec = &mut self.cpu_pool.free_llcs;
2889                // Available CPUs in LLC.
2890                let avail = cpus_per_llc - free_vec[i].1;
2891                // The amount we'll use.
2892                let mut used = extra.min(avail);
2893
2894                let shift = free_vec[i].1;
2895                free_vec[i].1 += used;
2896
2897                let llc_id = free_vec[i].0;
2898                let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2899
2900                for core in llc.cores.iter().skip(shift) {
2901                    core_order.push(core.1.id);
2902                    if used == 0 {
2903                        break;
2904                    }
2905                    used -= 1;
2906                }
2907
2908                extra -= used;
2909                if extra == 0 {
2910                    break;
2911                }
2912            }
2913
2914            core_order.reverse();
2915            layer.core_order = core_order;
2916        }
2917
2918        // Reset consumed entries in free LLCs.
2919        for i in 0..self.cpu_pool.free_llcs.len() {
2920            self.cpu_pool.free_llcs[i].1 = 0;
2921        }
2922
2923        for &(idx, _) in layer_targets.iter() {
2924            let layer = &mut self.layers[idx];
2925
2926            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2927                continue;
2928            }
2929
2930            for core in self.topo.all_cores.iter() {
2931                let llc_id = core.1.llc_id;
2932                if layer.assigned_llcs.contains(&llc_id) {
2933                    layer.core_order.push(core.1.id);
2934                }
2935            }
2936            // Update core_order for the layer, but reverse to keep the start stable.
2937            layer.core_order.reverse();
2938
2939            debug!(
2940                " alloc: layer={} core_order={:?}",
2941                layer.name, layer.core_order
2942            );
2943        }
2944    }
2945
2946    fn refresh_cpumasks(&mut self) -> Result<()> {
2947        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
2948
2949        let mut updated = false;
2950        let targets = self.calc_target_nr_cpus();
2951        let targets = self.weighted_target_nr_cpus(&targets);
2952
2953        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
2954        ascending.sort_by(|a, b| a.1.cmp(&b.1));
2955
2956        self.recompute_layer_core_order(&ascending);
2957
2958        // If any layer is growing, guarantee that the largest layer that is
2959        // freeing CPUs frees at least one CPU.
2960        let mut force_free = self
2961            .layers
2962            .iter()
2963            .zip(targets.iter())
2964            .any(|(layer, &target)| layer.nr_cpus < target);
2965
2966        // Shrink all layers first so that CPUs are available for
2967        // redistribution. Do so in descending order of target number of
2968        // CPUs.
2969        for &(idx, target) in ascending.iter().rev() {
2970            let layer = &mut self.layers[idx];
2971            if layer_is_open(layer) {
2972                continue;
2973            }
2974
2975            let nr_cur = layer.cpus.weight();
2976            if nr_cur <= target {
2977                continue;
2978            }
2979            let mut nr_to_free = nr_cur - target;
2980
2981            // There's some dampening built into the util metrics, but slow
2982            // down freeing further to avoid unnecessary changes. This is based
2983            // solely on intuition. Drop or update according to real-world
2984            // behavior.
2985            let nr_to_break_at = nr_to_free / 2;
2986
2987            let mut freed = false;
2988
2989            while nr_to_free > 0 {
2990                let max_to_free = if force_free {
2991                    force_free = false;
2992                    layer.nr_cpus
2993                } else {
2994                    nr_to_free
2995                };
2996
2997                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
2998                if nr_freed == 0 {
2999                    break;
3000                }
3001
3002                nr_to_free = nr_to_free.saturating_sub(nr_freed);
3003                freed = true;
3004
3005                if nr_to_free <= nr_to_break_at {
3006                    break;
3007                }
3008            }
3009
3010            if freed {
3011                Self::update_bpf_layer_cpumask(
3012                    layer,
3013                    &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3014                );
3015                updated = true;
3016            }
3017        }
3018
3019        // Grow layers. Do so in ascending order of target number of CPUs
3020        // so that we're always more generous to smaller layers. This avoids
3021        // starving small layers and shouldn't make a noticeable difference
3022        // for bigger layers as work conservation should still be achieved
3023        // through open execution.
3024        for &(idx, target) in &ascending {
3025            let layer = &mut self.layers[idx];
3026
3027            if layer_is_open(layer) {
3028                continue;
3029            }
3030
3031            let nr_cur = layer.cpus.weight();
3032            if nr_cur >= target {
3033                continue;
3034            }
3035
3036            let mut nr_to_alloc = target - nr_cur;
3037            let mut alloced = false;
3038
3039            while nr_to_alloc > 0 {
3040                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3041                if nr_alloced == 0 {
3042                    break;
3043                }
3044                alloced = true;
3045                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3046            }
3047
3048            if alloced {
3049                Self::update_bpf_layer_cpumask(
3050                    layer,
3051                    &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3052                );
3053                updated = true;
3054            }
3055        }
3056
3057        // Give the rest to the open layers.
3058        if updated {
3059            for (idx, layer) in self.layers.iter_mut().enumerate() {
3060                if !layer_is_open(layer) {
3061                    continue;
3062                }
3063
3064                let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3065                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3066                let nr_available_cpus = available_cpus.weight();
3067
3068                // Open layers need the intersection of allowed cpus and
3069                // available cpus.
3070                layer.cpus = available_cpus;
3071                layer.nr_cpus = nr_available_cpus;
3072                Self::update_bpf_layer_cpumask(layer, bpf_layer);
3073            }
3074
3075            self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3076                self.cpu_pool.fallback_cpu as u32;
3077
3078            for (lidx, layer) in self.layers.iter().enumerate() {
3079                self.nr_layer_cpus_ranges[lidx] = (
3080                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3081                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3082                );
3083            }
3084
3085            // Trigger updates on the BPF side.
3086            let input = ProgramInput {
3087                ..Default::default()
3088            };
3089            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3090            let _ = prog.test_run(input);
3091
3092            // Update empty_layers.
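            // e.g. with 4 layers where only layer 2 has no CPUs,
            // empty_layer_ids becomes [2, MAX_LAYERS, MAX_LAYERS, MAX_LAYERS]
            // and nr_empty_layer_ids is set to 1.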
3093            let empty_layer_ids: Vec<u32> = self
3094                .layers
3095                .iter()
3096                .enumerate()
3097                .filter(|(_idx, layer)| layer.nr_cpus == 0)
3098                .map(|(idx, _layer)| idx as u32)
3099                .collect();
3100            for i in 0..self.layers.len() {
3101                self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3102                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3103            }
3104            self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3105                empty_layer_ids.len() as u32;
3106        }
3107
3108        let _ = self.update_netdev_cpumasks();
3109        Ok(())
3110    }
3111
3112    fn refresh_idle_qos(&mut self) -> Result<()> {
3113        if !self.idle_qos_enabled {
3114            return Ok(());
3115        }
3116
3117        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3118        for layer in self.layers.iter() {
3119            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3120            for cpu in layer.cpus.iter() {
3121                cpu_idle_qos[cpu] = idle_resume_us;
3122            }
3123        }
3124
3125        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3126            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3127        }
3128
3129        Ok(())
3130    }
3131
3132    fn step(&mut self) -> Result<()> {
3133        let started_at = Instant::now();
3134        self.sched_stats.refresh(
3135            &mut self.skel,
3136            &self.proc_reader,
3137            started_at,
3138            self.processing_dur,
3139            &self.gpu_task_handler,
3140        )?;
3141        self.refresh_cpumasks()?;
3142        self.refresh_idle_qos()?;
3143        self.gpu_task_handler.maybe_affinitize();
3144        self.processing_dur += Instant::now().duration_since(started_at);
3145        Ok(())
3146    }
3147
3148    fn generate_sys_stats(
3149        &mut self,
3150        stats: &Stats,
3151        cpus_ranges: &mut [(usize, usize)],
3152    ) -> Result<SysStats> {
3153        let bstats = &stats.bpf_stats;
3154        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3155
3156        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3157            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3158            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3159            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3160        }
3161
3162        Ok(sys_stats)
3163    }
3164
3165    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3166        let (res_ch, req_ch) = self.stats_server.channels();
3167        let mut next_sched_at = Instant::now() + self.sched_intv;
3168        let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3169        let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3170        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3171
3172        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3173            let now = Instant::now();
3174
3175            if now >= next_sched_at {
3176                self.step()?;
3177                while next_sched_at < now {
3178                    next_sched_at += self.sched_intv;
3179                }
3180            }
3181
3182            if enable_layer_refresh && now >= next_layer_refresh_at {
3183                self.skel
3184                    .maps
3185                    .bss_data
3186                    .as_mut()
3187                    .unwrap()
3188                    .layer_refresh_seq_avgruntime += 1;
3189                while next_layer_refresh_at < now {
3190                    next_layer_refresh_at += self.layer_refresh_intv;
3191                }
3192            }
3193
3194            match req_ch.recv_deadline(next_sched_at) {
3195                Ok(StatsReq::Hello(tid)) => {
3196                    cpus_ranges.insert(
3197                        tid,
3198                        self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3199                    );
3200                    let stats =
3201                        Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3202                    res_ch.send(StatsRes::Hello(stats))?;
3203                }
3204                Ok(StatsReq::Refresh(tid, mut stats)) => {
3205                    // Propagate our per-layer cpu ranges into each stats client's ranges.
3206                    for i in 0..self.nr_layer_cpus_ranges.len() {
3207                        for (_, ranges) in cpus_ranges.iter_mut() {
3208                            ranges[i] = (
3209                                ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3210                                ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3211                            );
3212                        }
3213                        self.nr_layer_cpus_ranges[i] =
3214                            (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3215                    }
3216
3217                    stats.refresh(
3218                        &mut self.skel,
3219                        &self.proc_reader,
3220                        now,
3221                        self.processing_dur,
3222                        &self.gpu_task_handler,
3223                    )?;
3224                    let sys_stats =
3225                        self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3226                    res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3227                }
3228                Ok(StatsReq::Bye(tid)) => {
3229                    cpus_ranges.remove(&tid);
3230                    res_ch.send(StatsRes::Bye)?;
3231                }
3232                Err(RecvTimeoutError::Timeout) => {}
3233                Err(e) => Err(e)?,
3234            }
3235        }
3236
3237        let _ = self.struct_ops.take();
3238        uei_report!(&self.skel, uei)
3239    }
3240}
3241
3242impl Drop for Scheduler<'_> {
3243    fn drop(&mut self) {
3244        info!("Unregister {SCHEDULER_NAME} scheduler");
3245
3246        if let Some(struct_ops) = self.struct_ops.take() {
3247            drop(struct_ops);
3248        }
3249    }
3250}
3251
3252fn write_example_file(path: &str) -> Result<()> {
3253    let mut f = fs::OpenOptions::new()
3254        .create_new(true)
3255        .write(true)
3256        .open(path)?;
3257    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3258}
3259
3260fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, usize>> {
3261    let mut hint_to_layer_map = HashMap::<u64, (usize, String)>::new();
3262
3263    let nr_specs = specs.len();
3264    if nr_specs == 0 {
3265        bail!("No layer spec");
3266    }
3267    if nr_specs > MAX_LAYERS {
3268        bail!("Too many layer specs");
3269    }
3270
3271    for (idx, spec) in specs.iter().enumerate() {
3272        if idx < nr_specs - 1 {
3273            if spec.matches.is_empty() {
3274                bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3275            }
3276        } else {
3277            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3278                bail!("Terminal spec {:?} must have an empty match", spec.name);
3279            }
3280        }
3281
3282        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3283            bail!(
3284                "Spec {:?} has too many ({}) OR match blocks",
3285                spec.name,
3286                spec.matches.len()
3287            );
3288        }
3289
3290        for (ands_idx, ands) in spec.matches.iter().enumerate() {
3291            if ands.len() > NR_LAYER_MATCH_KINDS {
3292                bail!(
3293                    "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3294                    spec.name,
3295                    ands_idx,
3296                    ands.len()
3297                );
3298            }
3299            let mut hint_equals_cnt = 0;
3300            for one in ands.iter() {
3301                match one {
3302                    LayerMatch::CgroupPrefix(prefix) => {
3303                        if prefix.len() > MAX_PATH {
3304                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3305                        }
3306                    }
3307                    LayerMatch::CgroupSuffix(suffix) => {
3308                        if suffix.len() > MAX_PATH {
3309                            bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3310                        }
3311                    }
3312                    LayerMatch::CgroupContains(substr) => {
3313                        if substr.len() > MAX_PATH {
3314                            bail!("Spec {:?} has too long a cgroup substr", spec.name);
3315                        }
3316                    }
3317                    LayerMatch::CommPrefix(prefix) => {
3318                        if prefix.len() > MAX_COMM {
3319                            bail!("Spec {:?} has too long a comm prefix", spec.name);
3320                        }
3321                    }
3322                    LayerMatch::PcommPrefix(prefix) => {
3323                        if prefix.len() > MAX_COMM {
3324                            bail!("Spec {:?} has too long a process name prefix", spec.name);
3325                        }
3326                    }
3327                    LayerMatch::HintEquals(hint) => {
3328                        if *hint > 1024 {
3329                            bail!(
3330                                "Spec {:?} has hint value outside the range [0, 1024]",
3331                                spec.name
3332                            );
3333                        }
3334
3335                        if let Some((layer_id, name)) = hint_to_layer_map.get(hint) {
3336                            if *layer_id != idx {
3337                                bail!(
3338                                    "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
3339                                    spec.name,
3340                                    hint,
3341                                    name
3342                                );
3343                            }
3344                        } else {
3345                            hint_to_layer_map.insert(*hint, (idx, spec.name.clone()));
3346                        }
3347                        hint_equals_cnt += 1;
3348                    }
3349                    _ => {}
3350                }
3351            }
3352            if hint_equals_cnt > 1 {
3353                bail!("Only 1 HintEquals match permitted per AND block");
3354            }
3355            if hint_equals_cnt == 1 && ands.len() != 1 {
3356                bail!("HintEquals match cannot be in conjunction with other matches");
3357            }
3358        }
3359
3360        match spec.kind {
3361            LayerKind::Confined {
3362                cpus_range,
3363                util_range,
3364                ..
3365            }
3366            | LayerKind::Grouped {
3367                cpus_range,
3368                util_range,
3369                ..
3370            } => {
3371                if let Some((cpus_min, cpus_max)) = cpus_range {
3372                    if cpus_min > cpus_max {
3373                        bail!(
3374                            "Spec {:?} has invalid cpus_range({}, {})",
3375                            spec.name,
3376                            cpus_min,
3377                            cpus_max
3378                        );
3379                    }
3380                }
3381                if util_range.0 >= util_range.1 {
3382                    bail!(
3383                        "Spec {:?} has invalid util_range ({}, {})",
3384                        spec.name,
3385                        util_range.0,
3386                        util_range.1
3387                    );
3388                }
3389            }
3390            _ => {}
3391        }
3392    }
3393
3394    Ok(hint_to_layer_map
3395        .into_iter()
3396        .map(|(k, v)| (k, v.0))
3397        .collect())
3398}
3399
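// Return (up to) the last `len` characters of `cgroup`, e.g.
// name_suffix("/sys/fs/cgroup/foo/", 5) == "/foo/".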
3400fn name_suffix(cgroup: &str, len: usize) -> String {
3401    let suffixlen = std::cmp::min(len, cgroup.len());
3402    let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
3403
3404    suffixrev.chars().rev().collect()
3405}
3406
3407fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
3408    let mut paths = vec![];
3409
3410    if !dir.is_dir() {
3411        bail!("path {:?} does not correspond to a directory", dir);
3412    }
3413
3414    let direntries = fs::read_dir(dir)?;
3415
3416    for entry in direntries {
3417        let path = entry?.path();
3418        if path.is_dir() {
3419            paths.append(&mut traverse_sysfs(&path)?);
3420            paths.push(path);
3421        }
3422    }
3423
3424    Ok(paths)
3425}
3426
3427fn find_cpumask(cgroup: &str) -> Cpumask {
3428    let mut path = String::from(cgroup);
3429    path.push_str("/cpuset.cpus.effective");
3430
3431    let description = fs::read_to_string(&path).unwrap();
3432
3433    Cpumask::from_cpulist(&description).unwrap()
3434}
3435
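// Expand a template match rule into concrete (match, cpumask) pairs by walking
// /sys/fs/cgroup: every cgroup matching the rule yields a CgroupSuffix match on
// the trailing (up to 64 character) portion of its path plus its effective
// cpuset mask.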
3436fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
3437    match rule {
3438        LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3439            .into_iter()
3440            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3441            .filter(|cgroup| cgroup.ends_with(suffix))
3442            .map(|cgroup| {
3443                (
3444                    {
3445                        let mut slashterminated = cgroup.clone();
3446                        slashterminated.push('/');
3447                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3448                    },
3449                    find_cpumask(&cgroup),
3450                )
3451            })
3452            .collect()),
3453        LayerMatch::CgroupRegex(expr) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3454            .into_iter()
3455            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3456            .filter(|cgroup| {
3457                let re = Regex::new(expr).unwrap();
3458                re.is_match(cgroup)
3459            })
3460            .map(|cgroup| {
3461                (
3462                    // Here we convert the regex match into a suffix match because we still need to
3463                    // do the matching on the bpf side and doing a regex match in bpf isn't
3464                    // easily done.
3465                    {
3466                        let mut slashterminated = cgroup.clone();
3467                        slashterminated.push('/');
3468                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3469                    },
3470                    find_cpumask(&cgroup),
3471                )
3472            })
3473            .collect()),
3474        _ => panic!("Unimplemented template enum {:?}", rule),
3475    }
3476}
3477
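// Open one raw perf counter for `event` on every possible CPU and store the
// resulting fds in the scx_pmu_map BPF map so the BPF side can read the
// counter values. Failures on individual CPUs are logged and skipped.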
3478fn create_perf_fds(skel: &mut BpfSkel, event: u64) -> Result<()> {
3479    let mut attr = perf::bindings::perf_event_attr::default();
3480    attr.size = std::mem::size_of::<perf::bindings::perf_event_attr>() as u32;
3481    attr.type_ = perf::bindings::PERF_TYPE_RAW;
3482    attr.config = event;
3483    attr.sample_type = 0u64;
3484    attr.set_disabled(0);
3485
3486    let perf_events_map = &skel.maps.scx_pmu_map;
3487    let map_fd = unsafe { libbpf_sys::bpf_map__fd(perf_events_map.as_libbpf_object().as_ptr()) };
3488
3489    let mut failures = 0u64;
3490
3491    for cpu in 0..*NR_CPUS_POSSIBLE {
3492        let fd = unsafe { perf::perf_event_open(&mut attr as *mut _, -1, cpu as i32, -1, 0) };
3493        if fd < 0 {
3494            failures += 1;
3495            trace!(
3496                "perf_event_open failed cpu={cpu} errno={}",
3497                std::io::Error::last_os_error()
3498            );
3499            continue;
3500        }
3501
3502        let key = cpu as u32;
3503        let val = fd as u32;
3504        let ret = unsafe {
3505            libbpf_sys::bpf_map_update_elem(
3506                map_fd,
3507                &key as *const _ as *const _,
3508                &val as *const _ as *const _,
3509                0,
3510            )
3511        };
3512        if ret != 0 {
3513            trace!("bpf_map_update_elem failed cpu={cpu} fd={fd} ret={ret}");
3514        } else {
3515            trace!("mapped cpu={cpu} -> fd={fd}");
3516        }
3517    }
3518
3519    if failures > 0 {
3520        trace!("failed to install {failures} counters");
3521        // Keep going, do not fail the scheduler for this
3522    }
3523
3524    Ok(())
3525}
3526
3527// Set up the PMU counter used for memory bandwidth tracking.
3528fn setup_membw_tracking(skel: &mut OpenBpfSkel) -> Result<u64> {
3529    let pmumanager = PMUManager::new(None)?;
3530    let codename = &pmumanager.codename as &str;
3531
3532    let pmuspec = match codename {
3533        "amdzen1" | "amdzen2" | "amdzen3" => {
3534            trace!("found AMD codename {codename}");
3535            pmumanager.pmus.get("ls_any_fills_from_sys.mem_io_local")
3536        }
3537        "amdzen4" | "amdzen5" => {
3538            trace!("found AMD codename {codename}");
3539            pmumanager.pmus.get("ls_any_fills_from_sys.dram_io_all")
3540        }
3541
3542        "haswell" | "broadwell" | "broadwellde" | "broadwellx" | "skylake" | "skylakex"
3543        | "cascadelakex" | "arrowlake" | "meteorlake" | "sapphirerapids" | "emeraldrapids"
3544        | "graniterapids" => {
3545            trace!("found Intel codename {codename}");
3546            pmumanager.pmus.get("mem_load_retired.l3_miss")
3547        }
3548
3549        _ => {
3550            trace!("found unknown codename {codename}");
3551            None
3552        }
3553    };
3554
3555    let spec = pmuspec.ok_or_else(|| anyhow!("no membw PMU event for codename {codename}"))?;
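    // Encode the raw perf config as (umask << 8) | event; e.g. a hypothetical
    // umask 0x02 with event 0xd1 encodes to 0x02d1.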
3556    let config = (spec.umask << 8) | spec.event;
3557
3558    // Install the counter in the BPF map
3559    skel.maps.rodata_data.as_mut().unwrap().membw_event = config;
3560
3561    Ok(config)
3562}
3563
3564fn main() -> Result<()> {
3565    let opts = Opts::parse();
3566
3567    if opts.version {
3568        println!(
3569            "scx_layered {}",
3570            build_id::full_version(env!("CARGO_PKG_VERSION"))
3571        );
3572        return Ok(());
3573    }
3574
3575    if opts.help_stats {
3576        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
3577        return Ok(());
3578    }
3579
3580    if opts.no_load_frac_limit {
3581        warn!("--no-load-frac-limit is deprecated and a no-op");
3582    }
3583    if opts.layer_preempt_weight_disable != 0.0 {
3584        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
3585    }
3586    if opts.layer_growth_weight_disable != 0.0 {
3587        warn!("--layer-growth-weight-disable is deprecated and a no-op");
3588    }
3589    if opts.local_llc_iteration {
3590        warn!("--local_llc_iteration is deprecated and a no-op");
3591    }
3592
3593    let llv = match opts.verbose {
3594        0 => simplelog::LevelFilter::Info,
3595        1 => simplelog::LevelFilter::Debug,
3596        _ => simplelog::LevelFilter::Trace,
3597    };
3598    let mut lcfg = simplelog::ConfigBuilder::new();
3599    lcfg.set_time_offset_to_local()
3600        .expect("Failed to set local time offset")
3601        .set_time_level(simplelog::LevelFilter::Error)
3602        .set_location_level(simplelog::LevelFilter::Off)
3603        .set_target_level(simplelog::LevelFilter::Off)
3604        .set_thread_level(simplelog::LevelFilter::Off);
3605    simplelog::TermLogger::init(
3606        llv,
3607        lcfg.build(),
3608        simplelog::TerminalMode::Stderr,
3609        simplelog::ColorChoice::Auto,
3610    )?;
3611
3612    debug!("opts={:?}", &opts);
3613
3614    let shutdown = Arc::new(AtomicBool::new(false));
3615    let shutdown_clone = shutdown.clone();
3616    ctrlc::set_handler(move || {
3617        shutdown_clone.store(true, Ordering::Relaxed);
3618    })
3619    .context("Error setting Ctrl-C handler")?;
3620
3621    if let Some(intv) = opts.monitor.or(opts.stats) {
3622        let shutdown_copy = shutdown.clone();
3623        let jh = std::thread::spawn(move || {
3624            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
3625                Ok(_) => {
3626                    debug!("stats monitor thread finished successfully")
3627                }
3628                Err(error_object) => {
3629                    warn!(
3630                        "stats monitor thread finished because of an error {}",
3631                        error_object
3632                    )
3633                }
3634            }
3635        });
3636        if opts.monitor.is_some() {
3637            let _ = jh.join();
3638            return Ok(());
3639        }
3640    }
3641
3642    if let Some(path) = &opts.example {
3643        write_example_file(path)?;
3644        return Ok(());
3645    }
3646
3647    let mut layer_config = match opts.run_example {
3648        true => EXAMPLE_CONFIG.clone(),
3649        false => LayerConfig { specs: vec![] },
3650    };
3651
3652    for (idx, input) in opts.specs.iter().enumerate() {
3653        let specs = LayerSpec::parse(input)
3654            .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
3655
3656        for spec in specs {
3657            match spec.template {
3658                Some(ref rule) => {
3659                    let matches = expand_template(&rule)?;
3660                    // In the absence of matching cgroups, have template layers
3661                    // behave as non-template layers do.
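                    // Hypothetical example: a template spec named "web" with a
                    // CgroupSuffix("service.slice") rule expands into one layer
                    // per matching cgroup, each named "web" plus that cgroup's
                    // suffix and carrying that cgroup's effective cpuset mask.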
3662                    if matches.is_empty() {
3663                        layer_config.specs.push(spec);
3664                    } else {
3665                        for (mt, mask) in matches {
3666                            let mut genspec = spec.clone();
3667
3668                            genspec.cpuset = Some(mask);
3669
3670                            // Push the new "and" rule into each "or" term.
3671                            for orterm in &mut genspec.matches {
3672                                orterm.push(mt.clone());
3673                            }
3674
3675                            match &mt {
3676                                LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
3677                                _ => bail!("Template match has unexpected type"),
3678                            }
3679
3680                            // Push the generated layer into the config
3681                            layer_config.specs.push(genspec);
3682                        }
3683                    }
3684                }
3685
3686                None => {
3687                    layer_config.specs.push(spec);
3688                }
3689            }
3690        }
3691    }
3692
3693    for spec in layer_config.specs.iter_mut() {
3694        let common = spec.kind.common_mut();
3695
3696        if common.slice_us == 0 {
3697            common.slice_us = opts.slice_us;
3698        }
3699
3700        if common.weight == 0 {
3701            common.weight = DEFAULT_LAYER_WEIGHT;
3702        }
3703        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
3704
3705        if common.preempt {
3706            if common.disallow_open_after_us.is_some() {
3707                warn!(
3708                    "Preempt layer {} has non-null disallow_open_after_us, ignored",
3709                    &spec.name
3710                );
3711            }
3712            if common.disallow_preempt_after_us.is_some() {
3713                warn!(
3714                    "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
3715                    &spec.name
3716                );
3717            }
3718            common.disallow_open_after_us = Some(u64::MAX);
3719            common.disallow_preempt_after_us = Some(u64::MAX);
3720        } else {
3721            if common.disallow_open_after_us.is_none() {
3722                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
3723            }
3724
3725            if common.disallow_preempt_after_us.is_none() {
3726                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
3727            }
3728        }
3729
3730        if common.idle_smt.is_some() {
3731            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
3732        }
3733    }
3734
3735    let membw_required = layer_config.specs.iter().any(|spec| match spec.kind {
3736        LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
3737            membw_gb.is_some()
3738        }
3739        LayerKind::Open { .. } => false,
3740    });
3741
3742    if opts.print_and_exit {
3743        println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3744        return Ok(());
3745    }
3746
3747    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3748    let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
3749
3750    let mut open_object = MaybeUninit::uninit();
3751    loop {
3752        let mut sched = Scheduler::init(
3753            &opts,
3754            &layer_config.specs,
3755            &mut open_object,
3756            &hint_to_layer_map,
3757            membw_required,
3758        )?;
3759        if !sched.run(shutdown.clone())?.should_restart() {
3760            break;
3761        }
3762    }
3763
3764    Ok(())
3765}