scx_layered/main.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::ffi::CString;
12use std::fs;
13use std::io::Write;
14use std::mem::MaybeUninit;
15use std::ops::Sub;
16use std::path::Path;
17use std::path::PathBuf;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::thread::ThreadId;
22use std::time::Duration;
23use std::time::Instant;
24
25use anyhow::anyhow;
26use anyhow::bail;
27use anyhow::Context;
28use anyhow::Result;
29pub use bpf_skel::*;
30use clap::Parser;
31use crossbeam::channel::RecvTimeoutError;
32use lazy_static::lazy_static;
33use libbpf_rs::MapCore as _;
34use libbpf_rs::OpenObject;
35use libbpf_rs::ProgramInput;
36use log::info;
37use log::trace;
38use log::warn;
39use log::{debug, error};
40use nix::sched::CpuSet;
41use nvml_wrapper::error::NvmlError;
42use nvml_wrapper::Nvml;
43use once_cell::sync::OnceCell;
44use regex::Regex;
45use scx_bpf_compat;
46use scx_layered::*;
47use scx_stats::prelude::*;
48use scx_utils::build_id;
49use scx_utils::compat;
50use scx_utils::init_libbpf_logging;
51use scx_utils::libbpf_clap_opts::LibbpfOpts;
52use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
53use scx_utils::read_netdevs;
54use scx_utils::scx_enums;
55use scx_utils::scx_ops_attach;
56use scx_utils::scx_ops_load;
57use scx_utils::scx_ops_open;
58use scx_utils::uei_exited;
59use scx_utils::uei_report;
60use scx_utils::CoreType;
61use scx_utils::Cpumask;
62use scx_utils::Llc;
63use scx_utils::NetDev;
64use scx_utils::Topology;
65use scx_utils::UserExitInfo;
66use scx_utils::NR_CPUS_POSSIBLE;
67use scx_utils::NR_CPU_IDS;
68use stats::LayerStats;
69use stats::StatsReq;
70use stats::StatsRes;
71use stats::SysStats;
72use std::collections::VecDeque;
73use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
74
75const SCHEDULER_NAME: &str = "scx_layered";
76const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
77const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
78const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
79const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
80const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
81const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
82const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
83const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
84const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
85const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
86
87const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
88const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
89const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
90const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
91const LAYER_USAGE_PROTECTED_PREEMPT: usize =
92    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
93const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
94
95const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
96const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
97const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
98
99const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
100
101static NVML: OnceCell<Nvml> = OnceCell::new();
102
103fn nvml() -> Result<&'static Nvml, NvmlError> {
104    NVML.get_or_try_init(Nvml::init)
105}
106
107lazy_static! {
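    // USAGE_DECAY is the per-second decay factor: raising it to
    // USAGE_HALF_LIFE_F64 (in seconds) gives 0.5, so a constant load's
    // running average halves once per half-life.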
108    static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
109    static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
110    static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
111    static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
112        specs: vec![
113            LayerSpec {
114                name: "batch".into(),
115                comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
116                cpuset: None,
117                template: None,
118                matches: vec![
119                    vec![LayerMatch::CgroupPrefix("system.slice/".into())],
120                    vec![LayerMatch::NiceAbove(0)],
121                ],
122                kind: LayerKind::Confined {
123                    util_range: (0.8, 0.9),
124                    cpus_range: Some((0, 16)),
125                    cpus_range_frac: None,
126                    protected: false,
127                    membw_gb: None,
128                    common: LayerCommon {
129                        min_exec_us: 1000,
130                        yield_ignore: 0.0,
131                        preempt: false,
132                        preempt_first: false,
133                        exclusive: false,
134                        allow_node_aligned: false,
135                        skip_remote_node: false,
136                        prev_over_idle_core: false,
137                        idle_smt: None,
138                        slice_us: 20000,
139                        fifo: false,
140                        weight: DEFAULT_LAYER_WEIGHT,
141                        disallow_open_after_us: None,
142                        disallow_preempt_after_us: None,
143                        xllc_mig_min_us: 1000.0,
144                        growth_algo: LayerGrowthAlgo::Sticky,
145                        idle_resume_us: None,
146                        perf: 1024,
147                        nodes: vec![],
148                        llcs: vec![],
149                        placement: LayerPlacement::Standard,
150                    },
151                },
152            },
153            LayerSpec {
154                name: "immediate".into(),
155                comment: Some("tasks under workload.slice with nice value < 0".into()),
156                cpuset: None,
157                template: None,
158                matches: vec![vec![
159                    LayerMatch::CgroupPrefix("workload.slice/".into()),
160                    LayerMatch::NiceBelow(0),
161                ]],
162                kind: LayerKind::Open {
163                    common: LayerCommon {
164                        min_exec_us: 100,
165                        yield_ignore: 0.25,
166                        preempt: true,
167                        preempt_first: false,
168                        exclusive: true,
169                        allow_node_aligned: true,
170                        skip_remote_node: false,
171                        prev_over_idle_core: true,
172                        idle_smt: None,
173                        slice_us: 20000,
174                        fifo: false,
175                        weight: DEFAULT_LAYER_WEIGHT,
176                        disallow_open_after_us: None,
177                        disallow_preempt_after_us: None,
178                        xllc_mig_min_us: 0.0,
179                        growth_algo: LayerGrowthAlgo::Sticky,
180                        perf: 1024,
181                        idle_resume_us: None,
182                        nodes: vec![],
183                        llcs: vec![],
184                        placement: LayerPlacement::Standard,
185                    },
186                },
187            },
188            LayerSpec {
189                name: "stress-ng".into(),
190                comment: Some("stress-ng test layer".into()),
191                cpuset: None,
192                template: None,
193                matches: vec![
194                    vec![LayerMatch::CommPrefix("stress-ng".into()),],
195                    vec![LayerMatch::PcommPrefix("stress-ng".into()),]
196                ],
197                kind: LayerKind::Confined {
198                    cpus_range: None,
199                    util_range: (0.2, 0.8),
200                    protected: false,
201                    cpus_range_frac: None,
202                    membw_gb: None,
203                    common: LayerCommon {
204                        min_exec_us: 800,
205                        yield_ignore: 0.0,
206                        preempt: true,
207                        preempt_first: false,
208                        exclusive: false,
209                        allow_node_aligned: false,
210                        skip_remote_node: false,
211                        prev_over_idle_core: false,
212                        idle_smt: None,
213                        slice_us: 800,
214                        fifo: false,
215                        weight: DEFAULT_LAYER_WEIGHT,
216                        disallow_open_after_us: None,
217                        disallow_preempt_after_us: None,
218                        xllc_mig_min_us: 0.0,
219                        growth_algo: LayerGrowthAlgo::Topo,
220                        perf: 1024,
221                        idle_resume_us: None,
222                        nodes: vec![],
223                        llcs: vec![],
224                        placement: LayerPlacement::Standard,
225                    },
226                },
227            },
228            LayerSpec {
229                name: "normal".into(),
230                comment: Some("the rest".into()),
231                cpuset: None,
232                template: None,
233                matches: vec![vec![]],
234                kind: LayerKind::Grouped {
235                    cpus_range: None,
236                    util_range: (0.5, 0.6),
237                    util_includes_open_cputime: true,
238                    protected: false,
239                    cpus_range_frac: None,
240                    membw_gb: None,
241                    common: LayerCommon {
242                        min_exec_us: 200,
243                        yield_ignore: 0.0,
244                        preempt: false,
245                        preempt_first: false,
246                        exclusive: false,
247                        allow_node_aligned: false,
248                        skip_remote_node: false,
249                        prev_over_idle_core: false,
250                        idle_smt: None,
251                        slice_us: 20000,
252                        fifo: false,
253                        weight: DEFAULT_LAYER_WEIGHT,
254                        disallow_open_after_us: None,
255                        disallow_preempt_after_us: None,
256                        xllc_mig_min_us: 100.0,
257                        growth_algo: LayerGrowthAlgo::Linear,
258                        perf: 1024,
259                        idle_resume_us: None,
260                        nodes: vec![],
261                        llcs: vec![],
262                        placement: LayerPlacement::Standard,
263                    },
264                },
265            },
266        ],
267    };
268}
269
270/// scx_layered: A highly configurable multi-layer sched_ext scheduler
271///
272/// scx_layered allows classifying tasks into multiple layers and applying
273/// different scheduling policies to them. The configuration is specified in
274/// json and composed of two parts - matches and policies.
275///
276/// Matches
277/// =======
278///
279/// Whenever a task is forked or its attributes are changed, the task goes
280/// through a series of matches to determine the layer it belongs to. A
281/// match set is composed of OR groups of AND blocks. An example:
282///
283///   "matches": [
284///     [
285///       {
286///         "CgroupPrefix": "system.slice/"
287///       }
288///     ],
289///     [
290///       {
291///         "CommPrefix": "fbagent"
292///       },
293///       {
294///         "NiceAbove": 0
295///       }
296///     ]
297///   ],
298///
299/// The outer array contains the OR groups and the inner AND blocks, so the
300/// above matches:
301///
302/// - Tasks which are in the cgroup sub-hierarchy under "system.slice".
303///
304/// - Or tasks whose comm starts with "fbagent" and have a nice value > 0.
305///
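/// In other words, the match set is satisfied when any OR group has all of
/// its AND blocks matching. A rough sketch of the evaluation (pseudo-code,
/// not the actual BPF implementation; the names are illustrative):
///
///   matched = or_groups.any(|and_block| and_block.all(|m| m.matches(task)))
///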
306/// Currently, the following matches are supported:
307///
308/// - CgroupPrefix: Matches the prefix of the cgroup that the task belongs
309///   to. As this is a string match, whether the pattern has the trailing
310///   '/' makes a difference. For example, "TOP/CHILD/" only matches tasks
311///   which are under that particular cgroup while "TOP/CHILD" also matches
312///   tasks under "TOP/CHILD0/" or "TOP/CHILD1/".
313///
314/// - CommPrefix: Matches the task's comm prefix.
315///
316/// - PcommPrefix: Matches the task's thread group leader's comm prefix.
317///
318/// - NiceAbove: Matches if the task's nice value is greater than the
319///   pattern.
320///
321/// - NiceBelow: Matches if the task's nice value is smaller than the
322///   pattern.
323///
324/// - NiceEquals: Matches if the task's nice value is exactly equal to
325///   the pattern.
326///
327/// - UIDEquals: Matches if the task's effective user id matches the value
328///
329/// - GIDEquals: Matches if the task's effective group id matches the value.
330///
331/// - PIDEquals: Matches if the task's pid matches the value.
332///
333/// - PPIDEquals: Matches if the task's ppid matches the value.
334///
335/// - TGIDEquals: Matches if the task's tgid matches the value.
336///
337/// - NSPIDEquals: Matches if the task's namespace id and pid match the values.
338///
339/// - NSEquals: Matches if the task's namespace id matches the value.
340///
341/// - IsGroupLeader: Bool. When true, matches if the task is group leader
342///   (i.e. PID == TGID), aka the thread from which other threads are made.
343///   When false, matches if the task is *not* the group leader (i.e. the rest).
344///
345/// - CmdJoin: Matches when the task uses pthread_setname_np to send a join/leave
346///   command to the scheduler. See examples/cmdjoin.c for more details.
347///
348/// - UsedGpuTid: Bool. When true, matches tasks which have used a GPU,
349///   identified by tid.
350///
351/// - UsedGpuPid: Bool. When true, matches tasks which have used a GPU,
352///   identified by tgid/pid.
353///
354/// - [EXPERIMENTAL] AvgRuntime: (u64, u64). Match tasks whose average runtime
355///   is within the provided values [min, max).
356///
357/// - HintEquals: u64. Match tasks whose hint value equals this value.
358///   The value must be in the range [0, 1024].
359///
360/// While there are complexity limitations as the matches are performed in
361/// BPF, it is straightforward to add more types of matches.
362///
363/// Templates
364/// ---------
365///
366/// Templates let us create a variable number of layers dynamically at initialization
367/// time out of a cgroup name suffix/prefix. Sometimes we know there are multiple
368/// applications running on a machine, each with its own cgroup, but we do not know the
369/// exact names of the applications or cgroups, e.g., in cloud computing contexts where
370/// workloads are placed on machines dynamically and run under cgroups whose name is
371/// autogenerated. In that case, we cannot hardcode the cgroup match rules when writing
372/// the configuration. We thus cannot easily prevent tasks from different cgroups from
373/// falling into the same layer and affecting each other's performance.
374///
375///
376/// Templates offer a solution to this problem by generating one layer for each such cgroup,
377/// provided these cgroups share a suffix, and that the suffix is unique to them. Templates
378/// have a cgroup suffix rule that we use to find the relevant cgroups in the system. For each
379/// such cgroup, we copy the layer config and add a matching rule that matches just this cgroup.
380///
381///
382/// Policies
383/// ========
384///
385/// The following is an example policy configuration for a layer.
386///
387///   "kind": {
388///     "Confined": {
389///       "cpus_range": [1, 8],
390///       "util_range": [0.8, 0.9]
391///     }
392///   }
393///
394/// It's of "Confined" kind, which tries to concentrate the layer's tasks
395/// into a limited number of CPUs. In the above case, the number of CPUs
396/// assigned to the layer is scaled between 1 and 8 so that the per-cpu
397/// utilization is kept between 80% and 90%. If the CPUs are loaded higher
398/// than 90%, more CPUs are allocated to the layer. If the utilization drops
399/// below 80%, the layer loses CPUs.
400///
401/// Currently, the following policy kinds are supported:
402///
403/// - Confined: Tasks are restricted to the allocated CPUs. The number of
404///   CPUs allocated is modulated to keep the per-CPU utilization in
405///   "util_range". The range can optionally be restricted with the
406///   "cpus_range" property.
407///
408/// - Grouped: Similar to Confined but tasks may spill outside if there are
409///   idle CPUs outside the allocated ones. The range can optionally be
410///   restricted with the "cpus_range" property.
411///
412/// - Open: Prefer the CPUs which are not occupied by Confined or Grouped
413///   layers. Tasks in this group will spill into occupied CPUs if there are
414///   no unoccupied idle CPUs.
415///
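/// As a further illustration, the "normal" layer in the bundled example
/// configuration (written out by the -e flag) uses a Grouped policy roughly
/// like the following sketch:
///
///   "kind": {
///     "Grouped": {
///       "util_range": [0.5, 0.6],
///       "util_includes_open_cputime": true
///     }
///   }
///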
416/// All layers take the following options:
417///
418/// - min_exec_us: Minimum execution time in microseconds. Whenever a task
419///   is scheduled in, this is the minimum CPU time that it's charged no
420///   matter how short the actual execution time may be.
421///
422/// - yield_ignore: Yield ignore ratio. If 0.0, yield(2) forfeits a whole
423///   execution slice. 0.25 yields three quarters of an execution slice and
424///   so on. If 1.0, yield is completely ignored.
425///
426/// - slice_us: Scheduling slice duration in microseconds.
427///
428/// - fifo: Use FIFO queues within the layer instead of the default vtime.
429///
430/// - preempt: If true, tasks in the layer will preempt tasks which belong
431///   to other non-preempting layers when no idle CPUs are available.
432///
433/// - preempt_first: If true, tasks in the layer will try to preempt tasks
434///   in their previous CPUs before trying to find idle CPUs.
435///
436/// - exclusive: If true, tasks in the layer will occupy the whole core. The
437///   other logical CPUs sharing the same core will be kept idle. This isn't
438///   a hard guarantee, so don't depend on it for security purposes.
439///
440/// - allow_node_aligned: Put node aligned tasks on layer DSQs instead of lo
441///   fallback. This is a hack to support node-affine tasks without making
442///   the whole scheduler node aware and should only be used with open
443///   layers on non-saturated machines to avoid possible stalls.
444///
445/// - prev_over_idle_core: On SMT enabled systems, prefer the task's previous
446///   CPU when picking a CPU for tasks in this layer, even if that CPU's SMT
447///   sibling is processing a task.
448///
449/// - weight: Weight of the layer, which is a range from 1 to 10000 with a
450///   default of 100. Layer weights are used during contention to prevent
451///   starvation across layers. Weights are used in combination with
452///   utilization to determine the infeasible adjusted weight with higher
453///   weights having a larger adjustment in adjusted utilization.
454///
455/// - disallow_open_after_us: Duration to wait after machine reaches saturation
456///   before confining tasks in Open layers.
457///
458/// - cpus_range_frac: Array of 2 floats between 0 and 1.0. Lower and upper
459///   bound fractions of all CPUs to give to a layer. Mutually exclusive
460///   with cpus_range.
461///
462/// - disallow_preempt_after_us: Duration to wait after machine reaches saturation
463///   before disallowing tasks in the layer from preempting.
464///
465/// - xllc_mig_min_us: Skip cross-LLC migrations if they are likely to run on
466///   their existing LLC sooner than this.
467///
468/// - idle_smt: ***DEPRECATED***
469///
470/// - growth_algo: When a layer is allocated new CPUs different algorithms can
471///   be used to determine which CPU should be allocated next. The default
472///   algorithm is a "sticky" algorithm that attempts to spread layers evenly
473///   across cores.
474///
475/// - perf: CPU performance target. 0 means no configuration. A value
476///   between 1 and 1024 indicates the performance level CPUs running tasks
477///   in this layer are configured to using scx_bpf_cpuperf_set().
478///
479/// - idle_resume_us: Sets the idle resume QoS value. CPU idle time governors are expected to
480///   regard the minimum of the global (effective) CPU latency limit and the effective resume
481///   latency constraint for the given CPU as the upper limit for the exit latency of the idle
482///   states. See the latest kernel docs for more details:
483///   https://www.kernel.org/doc/html/latest/admin-guide/pm/cpuidle.html
484///
485/// - nodes: If set, the layer will use the set of NUMA nodes for scheduling
486///   decisions. If unset, all available NUMA nodes will be used. If the
487///   llcs value is set, the cpuset of NUMA nodes will be or'ed with the LLC
488///   config.
489///
490/// - llcs: If set, the layer will use the set of LLCs (last level caches)
491///   for scheduling decisions. If unset, all LLCs will be used. If
492///   the nodes value is set, the cpuset of LLCs will be or'ed with the nodes
493///   config.
494///
495///
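/// As a sketch based on the bundled example configuration (generate it with
/// the -e flag for the authoritative form), the common options are specified
/// alongside the kind-specific fields, e.g.:
///
///   "kind": {
///     "Confined": {
///       "util_range": [0.8, 0.9],
///       "cpus_range": [0, 16],
///       "min_exec_us": 1000,
///       "slice_us": 20000,
///       "growth_algo": "Sticky"
///     }
///   }
///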
496/// Similar to matches, adding new policies and extending existing ones
497/// should be relatively straightforward.
498///
499/// Configuration example and running scx_layered
500/// =============================================
501///
502/// An scx_layered config is composed of layer configs. A layer config is
503/// composed of a name, a set of matches, and a policy block. Running the
504/// following will write an example configuration into example.json.
505///
506///   $ scx_layered -e example.json
507///
508/// Note that the last layer in the configuration must have an empty match set
509/// as a catch-all for tasks which haven't been matched into previous layers.
510///
511/// The configuration can be specified in multiple json files and
512/// command line arguments, which are concatenated in the specified
513/// order. Each must contain valid layer configurations.
514///
515/// By default, an argument to scx_layered is interpreted as a JSON string. If
516/// the argument is a pointer to a JSON file, it should be prefixed with file:
517/// or f: as follows:
518///
519///   $ scx_layered file:example.json
520///   ...
521///   $ scx_layered f:example.json
522///
523/// Monitoring Statistics
524/// =====================
525///
526/// Run with `--stats INTERVAL` to enable stats monitoring. There is
527/// also an scx_stat server listening on /var/run/scx/root/stat that can
528/// be monitored by running `scx_layered --monitor INTERVAL` separately.
529///
530///   ```bash
531///   $ scx_layered --monitor 1
532///   tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 proc=6ms
533///   busy= 34.2 util= 1733.6 load=  21744.1 fallback_cpu=  1
534///     batch    : util/frac=   11.8/  0.7 load/frac=     29.7:  0.1 tasks=  2597
535///                tot=   3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00
536///                cpus=  2 [  2,  2] 04000001 00000000
537///     immediate: util/frac= 1218.8/ 70.3 load/frac=  21399.9: 98.4 tasks=  1107
538///                tot=  68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00
539///                cpus= 50 [ 50, 50] fbfffffe 000fffff
540///     normal   : util/frac=  502.9/ 29.0 load/frac=    314.5:  1.4 tasks=  3512
541///                tot=  45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56
542///                cpus= 50 [ 50, 50] fbfffffe 000fffff
543///   ```
544///
545/// Global statistics: see [`SysStats`]
546///
547/// Per-layer statistics: see [`LayerStats`]
548///
549#[derive(Debug, Parser)]
550#[command(verbatim_doc_comment)]
551struct Opts {
552    /// Scheduling slice duration in microseconds.
553    #[clap(short = 's', long, default_value = "20000")]
554    slice_us: u64,
555
556    /// Maximum consecutive execution time in microseconds. A task may be
557    /// allowed to keep executing on a CPU for this long. Note that this is
558    /// the upper limit and a task may have to be moved off the CPU earlier. 0
559    /// indicates default - 20 * slice_us.
560    #[clap(short = 'M', long, default_value = "0")]
561    max_exec_us: u64,
562
563    /// Scheduling interval in seconds.
564    #[clap(short = 'i', long, default_value = "0.1")]
565    interval: f64,
566
567    /// ***DEPRECATED*** Disable load-fraction based max layer CPU limit.
569    #[clap(short = 'n', long, default_value = "false")]
570    no_load_frac_limit: bool,
571
572    /// Exit debug dump buffer length. 0 indicates default.
573    #[clap(long, default_value = "0")]
574    exit_dump_len: u32,
575
576    /// Enable verbose output, including libbpf details. Specify multiple
577    /// times to increase verbosity.
578    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
579    verbose: u8,
580
581    /// Disable topology awareness. When enabled, the "nodes" and "llcs" settings on
582    /// a layer are ignored. Defaults to false on topologies with multiple NUMA nodes
583    /// or LLCs, and true otherwise.
584    #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
585    disable_topology: Option<bool>,
586
587    /// Enable cross NUMA preemption.
588    #[clap(long)]
589    xnuma_preemption: bool,
590
591    /// Disable monitor
592    #[clap(long)]
593    monitor_disable: bool,
594
595    /// Write example layer specifications into the file and exit.
596    #[clap(short = 'e', long)]
597    example: Option<String>,
598
599    /// ***DEPRECATED*** Disables preemption if the weighted load fraction
600    /// of a layer (load_frac_adj) exceeds the threshold. The default is
601    /// disabled (0.0).
602    #[clap(long, default_value = "0.0")]
603    layer_preempt_weight_disable: f64,
604
605    /// ***DEPRECATED*** Disables layer growth if the weighted load fraction
606    /// of a layer (load_frac_adj) exceeds the threshold. The default is
607    /// disabled (0.0).
608    #[clap(long, default_value = "0.0")]
609    layer_growth_weight_disable: f64,
610
611    /// Enable stats monitoring with the specified interval.
612    #[clap(long)]
613    stats: Option<f64>,
614
615    /// Run in stats monitoring mode with the specified interval. Scheduler
616    /// is not launched.
617    #[clap(long)]
618    monitor: Option<f64>,
619
620    /// Run with example layer specifications (useful for e.g. CI pipelines)
621    #[clap(long)]
622    run_example: bool,
623
624    /// ***DEPRECATED*** Enables iteration over local LLCs first for
625    /// dispatch.
626    #[clap(long, default_value = "false")]
627    local_llc_iteration: bool,
628
629    /// Low priority fallback DSQs are used to execute tasks with custom CPU
630    /// affinities. These DSQs are immediately executed iff a CPU is
631    /// otherwise idle. However, after the specified wait, they are
632    /// guaranteed up to --lo-fb-share fraction of each CPU.
633    #[clap(long, default_value = "10000")]
634    lo_fb_wait_us: u64,
635
636    /// The fraction of CPU time guaranteed to low priority fallback DSQs.
637    /// See --lo-fb-wait-us.
638    #[clap(long, default_value = ".05")]
639    lo_fb_share: f64,
640
641    /// Disable antistall
642    #[clap(long, default_value = "false")]
643    disable_antistall: bool,
644
645    /// Enable NUMA topology based GPU task affinitization.
646    #[clap(long, default_value = "false")]
647    enable_gpu_affinitize: bool,
648
649    /// Interval at which to reaffinitize GPU tasks to NUMA nodes.
650    /// Defaults to 900s.
651    #[clap(long, default_value = "900")]
652    gpu_affinitize_secs: u64,
653
654    /// Enable match debug
655    /// This stores a mapping of task tid
656    /// to layer id such that bpftool map dump
657    /// can be used to debug layer matches.
658    #[clap(long, default_value = "false")]
659    enable_match_debug: bool,
660
661    /// Maximum task runnable_at delay (in seconds) before antistall turns on
662    #[clap(long, default_value = "3")]
663    antistall_sec: u64,
664
665    /// Enable gpu support
666    #[clap(long, default_value = "false")]
667    enable_gpu_support: bool,
668
669    /// GPU kprobe level.
670    /// The value set here determines how aggressive
671    /// the kprobes enabled on GPU driver functions are.
672    /// Higher values are more aggressive, incurring more system overhead
673    /// while identifying PIDs using GPUs more accurately and in a more timely manner.
674    /// Lower values incur less system overhead, at the cost of identifying
675    /// GPU PIDs less accurately and taking longer to do so.
676    #[clap(long, default_value = "3")]
677    gpu_kprobe_level: u64,
678
679    /// Enable netdev IRQ balancing. This is experimental and should be used with caution.
680    #[clap(long, default_value = "false")]
681    netdev_irq_balance: bool,
682
683    /// Disable queued wakeup optimization.
684    #[clap(long, default_value = "false")]
685    disable_queued_wakeup: bool,
686
687    /// Per-cpu kthreads are preempting by default. Make it not so.
688    #[clap(long, default_value = "false")]
689    disable_percpu_kthread_preempt: bool,
690
691    /// Only highpri (nice < 0) per-cpu kthreads are preempting by default.
692    /// Make every per-cpu kthread preempting. Meaningful only if
693    /// --disable-percpu-kthread-preempt is not set.
694    #[clap(long, default_value = "false")]
695    percpu_kthread_preempt_all: bool,
696
697    /// Print scheduler version and exit.
698    #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
699    version: bool,
700
701    /// Show descriptions for statistics.
702    #[clap(long)]
703    help_stats: bool,
704
705    /// Layer specification. See --help.
706    specs: Vec<String>,
707
708    /// Periodically force tasks in layers using the AvgRuntime match rule to
709    /// reevaluate which layer they belong to. Default period of 2s; 0 turns this off.
710    #[clap(long, default_value = "2000")]
711    layer_refresh_ms_avgruntime: u64,
712
713    /// Set the path for pinning the task hint map.
714    #[clap(long, default_value = "")]
715    task_hint_map: String,
716
717    /// Print the config (after template expansion) and exit.
718    #[clap(long, default_value = "false")]
719    print_and_exit: bool,
720
721    /// Enable affinitized tasks to use the hi fallback queue to get more CPU time.
722    #[clap(long, default_value = "")]
723    hi_fb_thread_name: String,
724
725    #[clap(flatten, next_help_heading = "Libbpf Options")]
726    pub libbpf: LibbpfOpts,
727}
728
729fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
730    reader
731        .read_stat()
732        .context("Failed to read procfs")?
733        .total_cpu
734        .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
735}
736
737fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
738    match (curr, prev) {
739        (
740            fb_procfs::CpuStat {
741                user_usec: Some(curr_user),
742                nice_usec: Some(curr_nice),
743                system_usec: Some(curr_system),
744                idle_usec: Some(curr_idle),
745                iowait_usec: Some(curr_iowait),
746                irq_usec: Some(curr_irq),
747                softirq_usec: Some(curr_softirq),
748                stolen_usec: Some(curr_stolen),
749                ..
750            },
751            fb_procfs::CpuStat {
752                user_usec: Some(prev_user),
753                nice_usec: Some(prev_nice),
754                system_usec: Some(prev_system),
755                idle_usec: Some(prev_idle),
756                iowait_usec: Some(prev_iowait),
757                irq_usec: Some(prev_irq),
758                softirq_usec: Some(prev_softirq),
759                stolen_usec: Some(prev_stolen),
760                ..
761            },
762        ) => {
763            let idle_usec = curr_idle.saturating_sub(*prev_idle);
764            let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
765            let user_usec = curr_user.saturating_sub(*prev_user);
766            let system_usec = curr_system.saturating_sub(*prev_system);
767            let nice_usec = curr_nice.saturating_sub(*prev_nice);
768            let irq_usec = curr_irq.saturating_sub(*prev_irq);
769            let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
770            let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
771
772            let busy_usec =
773                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
774            let total_usec = idle_usec + busy_usec + iowait_usec;
775            if total_usec > 0 {
776                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
777            } else {
778                Ok(1.0)
779            }
780        }
781        _ => bail!("Missing stats in cpustat"),
782    }
783}
784
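// Copy `src` into `dst` as a NUL-terminated C string. `dst` must be large
// enough to hold the string plus the terminating NUL.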
785fn copy_into_cstr(dst: &mut [i8], src: &str) {
786    let cstr = CString::new(src).unwrap();
787    let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
788    dst[0..bytes.len()].copy_from_slice(bytes);
789}
790
791fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
792    let mut mask = 0;
793    for node in nodes {
794        mask |= 1 << node;
795    }
796    mask
797}
798
799fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
800    let mut mask = 0;
801    for (_, cache) in llcs {
802        mask |= 1 << cache.id;
803    }
804    mask
805}
806
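// Read the per-CPU `cpu_ctx` structs from the BPF per-CPU array map;
// `lookup_percpu` returns one value per possible CPU.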
807fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
808    let mut cpu_ctxs = vec![];
809    let cpu_ctxs_vec = skel
810        .maps
811        .cpu_ctxs
812        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
813        .context("Failed to lookup cpu_ctx")?
814        .unwrap();
815    for cpu in 0..*NR_CPUS_POSSIBLE {
816        cpu_ctxs.push(*unsafe {
817            &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
818        });
819    }
820    Ok(cpu_ctxs)
821}
822
823#[derive(Clone, Debug)]
824struct BpfStats {
825    gstats: Vec<u64>,
826    lstats: Vec<Vec<u64>>,
827    lstats_sums: Vec<u64>,
828    llc_lstats: Vec<Vec<Vec<u64>>>, // [layer][llc][stat]
829}
830
831impl BpfStats {
832    fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
833        let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
834        let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
835        let mut gstats = vec![0u64; NR_GSTATS];
836        let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
837        let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
838
839        for cpu in 0..*NR_CPUS_POSSIBLE {
840            for stat in 0..NR_GSTATS {
841                gstats[stat] += cpu_ctxs[cpu].gstats[stat];
842            }
843            for layer in 0..nr_layers {
844                for stat in 0..NR_LSTATS {
845                    lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
846                }
847            }
848        }
849
850        let mut lstats_sums = vec![0u64; NR_LSTATS];
851        for layer in 0..nr_layers {
852            for stat in 0..NR_LSTATS {
853                lstats_sums[stat] += lstats[layer][stat];
854            }
855        }
856
857        for llc_id in 0..nr_llcs {
858            // XXX - This would be a lot easier if llc_ctx were in
859            // the bss. Unfortunately, kernel < v6.12 crashes and
860            // kernel >= v6.12 fails verification after such
861            // conversion due to seemingly verifier bugs. Convert to
862            // bss maps later.
863            let key = llc_id as u32;
864            let llc_id_slice =
865                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
866            let v = skel
867                .maps
868                .llc_data
869                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
870                .unwrap()
871                .unwrap();
872            let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
873
874            for layer_id in 0..nr_layers {
875                for stat_id in 0..NR_LLC_LSTATS {
876                    llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
877                }
878            }
879        }
880
881        Self {
882            gstats,
883            lstats,
884            lstats_sums,
885            llc_lstats,
886        }
887    }
888}
889
890impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
891    type Output = BpfStats;
892
893    fn sub(self, rhs: &'b BpfStats) -> BpfStats {
894        let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
895        BpfStats {
896            gstats: vec_sub(&self.gstats, &rhs.gstats),
897            lstats: self
898                .lstats
899                .iter()
900                .zip(rhs.lstats.iter())
901                .map(|(l, r)| vec_sub(l, r))
902                .collect(),
903            lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
904            llc_lstats: self
905                .llc_lstats
906                .iter()
907                .zip(rhs.llc_lstats.iter())
908                .map(|(l_layer, r_layer)| {
909                    l_layer
910                        .iter()
911                        .zip(r_layer.iter())
912                        .map(|(l_llc, r_llc)| {
913                            let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
914                            // Lat is not subtractable, take L side.
915                            r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
916                            vec_sub(&l_llc, &r_llc)
917                        })
918                        .collect()
919                })
920                .collect(),
921        }
922    }
923}
924
925#[derive(Clone, Debug)]
926struct Stats {
927    at: Instant,
928    elapsed: Duration,
929    nr_layers: usize,
930    nr_layer_tasks: Vec<usize>,
931    nr_nodes: usize,
932
933    total_util: f64, // Running AVG of sum of layer_utils
934    layer_utils: Vec<Vec<f64>>,
935    prev_layer_usages: Vec<Vec<u64>>,
936
937    layer_membws: Vec<Vec<f64>>, // Estimated memory bandwidth consumption
938    prev_layer_membw_agg: Vec<Vec<u64>>, // Estimated aggregate membw consumption
939
940    cpu_busy: f64, // Read from /proc, maybe higher than total_util
941    prev_total_cpu: fb_procfs::CpuStat,
942
943    bpf_stats: BpfStats,
944    prev_bpf_stats: BpfStats,
945
946    processing_dur: Duration,
947    prev_processing_dur: Duration,
948
949    layer_slice_us: Vec<u64>,
950
951    gpu_tasks_affinitized: u64,
952    gpu_task_affinitization_ms: u64,
953}
954
955impl Stats {
956    fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
957        let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
958
959        for cpu in 0..*NR_CPUS_POSSIBLE {
960            for layer in 0..nr_layers {
961                for usage in 0..NR_LAYER_USAGES {
962                    layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
963                }
964            }
965        }
966
967        layer_usages
968    }
969
970    // Same as above, but for aggregate memory bandwidth consumption.
971    fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
972        let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
973
974        for cpu in 0..*NR_CPUS_POSSIBLE {
975            for layer in 0..nr_layers {
976                for usage in 0..NR_LAYER_USAGES {
977                    layer_membw_agg[layer][usage] += cpu_ctxs[cpu].layer_membw_agg[layer][usage];
978                }
979            }
980        }
981
982        layer_membw_agg
983    }
984
985    fn new(
986        skel: &mut BpfSkel,
987        proc_reader: &fb_procfs::ProcReader,
988        gpu_task_affinitizer: &GpuTaskAffinitizer,
989    ) -> Result<Self> {
990        let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
991        let cpu_ctxs = read_cpu_ctxs(skel)?;
992        let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
993        let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
994
995        Ok(Self {
996            at: Instant::now(),
997            elapsed: Default::default(),
998            nr_layers,
999            nr_layer_tasks: vec![0; nr_layers],
1000            nr_nodes,
1001
1002            total_util: 0.0,
1003            layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1004            layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1005            prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1006            prev_layer_membw_agg: Self::read_layer_membw_agg(&cpu_ctxs, nr_layers),
1007
1008            cpu_busy: 0.0,
1009            prev_total_cpu: read_total_cpu(proc_reader)?,
1010
1011            bpf_stats: bpf_stats.clone(),
1012            prev_bpf_stats: bpf_stats,
1013
1014            processing_dur: Default::default(),
1015            prev_processing_dur: Default::default(),
1016
1017            layer_slice_us: vec![0; nr_layers],
1018            gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1019            gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1020        })
1021    }
1022
1023    fn refresh(
1024        &mut self,
1025        skel: &mut BpfSkel,
1026        proc_reader: &fb_procfs::ProcReader,
1027        now: Instant,
1028        cur_processing_dur: Duration,
1029        gpu_task_affinitizer: &GpuTaskAffinitizer,
1030    ) -> Result<()> {
1031        let elapsed = now.duration_since(self.at);
1032        let elapsed_f64 = elapsed.as_secs_f64();
1033        let cpu_ctxs = read_cpu_ctxs(skel)?;
1034
1035        let nr_layer_tasks: Vec<usize> = skel
1036            .maps
1037            .bss_data
1038            .as_ref()
1039            .unwrap()
1040            .layers
1041            .iter()
1042            .take(self.nr_layers)
1043            .map(|layer| layer.nr_tasks as usize)
1044            .collect();
1045        let layer_slice_us: Vec<u64> = skel
1046            .maps
1047            .bss_data
1048            .as_ref()
1049            .unwrap()
1050            .layers
1051            .iter()
1052            .take(self.nr_layers)
1053            .map(|layer| layer.slice_ns / 1000_u64)
1054            .collect();
1055
1056        let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1057        let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1058
1059        let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1060            cur_agg
1061                .iter()
1062                .zip(prev_agg.iter())
1063                .map(|(cur, prev)| {
1064                    cur.iter()
1065                        .zip(prev.iter())
1066                        .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1067                        .collect()
1068                })
1069                .collect()
1070        };
1071
1072        let cur_layer_utils: Vec<Vec<f64>> =
1073            compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1074        let cur_layer_membw: Vec<Vec<f64>> =
1075            compute_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1076
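        // Fold the per-interval rates into running averages using an
        // exponential decay whose half-life is USAGE_HALF_LIFE.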
1077        let metric_decay = |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>| {
1078            cur_metric
1079                .iter()
1080                .zip(prev_metric.iter())
1081                .map(|(cur, prev)| {
1082                    cur.iter()
1083                        .zip(prev.iter())
1084                        .map(|(c, p)| {
1085                            let decay = USAGE_DECAY.powf(elapsed_f64);
1086                            p * decay + c * (1.0 - decay)
1087                        })
1088                        .collect()
1089                })
1090                .collect()
1091        };
1092
1093        let layer_utils: Vec<Vec<f64>> = metric_decay(cur_layer_utils, &self.layer_utils);
1094        let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws);
1095
1096        let cur_total_cpu = read_total_cpu(proc_reader)?;
1097        let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1098
1099        let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1100        let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1101
1102        let processing_dur = cur_processing_dur
1103            .checked_sub(self.prev_processing_dur)
1104            .unwrap();
1105
1106        *self = Self {
1107            at: now,
1108            elapsed,
1109            nr_layers: self.nr_layers,
1110            nr_layer_tasks,
1111            nr_nodes: self.nr_nodes,
1112
1113            total_util: layer_utils
1114                .iter()
1115                .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1116                .sum(),
1117            layer_utils,
1118            layer_membws,
1119            prev_layer_usages: cur_layer_usages,
1120            prev_layer_membw_agg: cur_layer_membw_agg,
1121
1122            cpu_busy,
1123            prev_total_cpu: cur_total_cpu,
1124
1125            bpf_stats,
1126            prev_bpf_stats: cur_bpf_stats,
1127
1128            processing_dur,
1129            prev_processing_dur: cur_processing_dur,
1130
1131            layer_slice_us,
1132            gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1133            gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1134        };
1135        Ok(())
1136    }
1137}
1138
1139#[derive(Debug)]
1140struct Layer {
1141    name: String,
1142    kind: LayerKind,
1143    growth_algo: LayerGrowthAlgo,
1144    core_order: Vec<usize>,
1145
1146    target_llc_cpus: (usize, usize),
1147    assigned_llcs: Vec<usize>,
1148
1149    nr_cpus: usize,
1150    nr_llc_cpus: Vec<usize>,
1151    cpus: Cpumask,
1152    allowed_cpus: Cpumask,
1153}
1154
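// Look up a symbol's address in /proc/kallsyms. Note that this is a substring
// match against the whole line, so the first line containing `sym_name` wins.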
1155fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1156    fs::read_to_string("/proc/kallsyms")?
1157        .lines()
1158        .find(|line| line.contains(sym_name))
1159        .and_then(|line| line.split_whitespace().next())
1160        .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1161        .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1162}
1163
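// Resolve a layer's CPU count bounds: either an explicit (min, max) count from
// `cpus_range` or a fraction of all CPUs from `cpus_range_frac` (the two are
// mutually exclusive), defaulting to (0, max_cpus) when neither is given.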
1164fn resolve_cpus_pct_range(
1165    cpus_range: &Option<(usize, usize)>,
1166    cpus_range_frac: &Option<(f64, f64)>,
1167    max_cpus: usize,
1168) -> Result<(usize, usize)> {
1169    match (cpus_range, cpus_range_frac) {
1170        (Some(_x), Some(_y)) => {
1171            bail!("cpus_range cannot be used with cpus_pct.");
1172        }
1173        (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1174        (None, Some((cpus_frac_min, cpus_frac_max))) => {
1175            if *cpus_frac_min < 0_f64
1176                || *cpus_frac_min > 1_f64
1177                || *cpus_frac_max < 0_f64
1178                || *cpus_frac_max > 1_f64
1179            {
1180                bail!("cpus_range_frac values must be between 0.0 and 1.0");
1181            }
1182            let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1183            let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1184            Ok((
1185                std::cmp::max(cpus_min_count, 1),
1186                std::cmp::min(cpus_max_count, max_cpus),
1187            ))
1188        }
1189        (None, None) => Ok((0, max_cpus)),
1190    }
1191}
1192
1193impl Layer {
1194    fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1195        let name = &spec.name;
1196        let kind = spec.kind.clone();
1197        let mut allowed_cpus = Cpumask::new();
1198        match &kind {
1199            LayerKind::Confined {
1200                cpus_range,
1201                cpus_range_frac,
1202                common: LayerCommon { nodes, llcs, .. },
1203                ..
1204            } => {
1205                let cpus_range =
1206                    resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1207                if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1208                    bail!("invalid cpus_range {:?}", cpus_range);
1209                }
1210                if nodes.is_empty() && llcs.is_empty() {
1211                    allowed_cpus.set_all();
1212                } else {
1213                    // build up the cpus bitset
1214                    for (node_id, node) in &topo.nodes {
1215                        // first do the matching for nodes
1216                        if nodes.contains(node_id) {
1217                            for &id in node.all_cpus.keys() {
1218                                allowed_cpus.set_cpu(id)?;
1219                            }
1220                        }
1221                        // next match on any LLCs
1222                        for (llc_id, llc) in &node.llcs {
1223                            if llcs.contains(llc_id) {
1224                                for &id in llc.all_cpus.keys() {
1225                                    allowed_cpus.set_cpu(id)?;
1226                                }
1227                            }
1228                        }
1229                    }
1230                }
1231            }
1232            LayerKind::Grouped {
1233                common: LayerCommon { nodes, llcs, .. },
1234                ..
1235            }
1236            | LayerKind::Open {
1237                common: LayerCommon { nodes, llcs, .. },
1238                ..
1239            } => {
1240                if nodes.is_empty() && llcs.is_empty() {
1241                    allowed_cpus.set_all();
1242                } else {
1243                    // build up the cpus bitset
1244                    for (node_id, node) in &topo.nodes {
1245                        // first do the matching for nodes
1246                        if nodes.contains(node_id) {
1247                            for &id in node.all_cpus.keys() {
1248                                allowed_cpus.set_cpu(id)?;
1249                            }
1250                        }
1251                        // next match on any LLCs
1252                        for (llc_id, llc) in &node.llcs {
1253                            if llcs.contains(llc_id) {
1254                                for &id in llc.all_cpus.keys() {
1255                                    allowed_cpus.set_cpu(id)?;
1256                                }
1257                            }
1258                        }
1259                    }
1260                }
1261            }
1262        }
1263
1264        // Util can be above 1.0 for grouped layers if
1265        // util_includes_open_cputime is set.
1266        if let Some(util_range) = kind.util_range() {
1267            if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1268                bail!("invalid util_range {:?}", util_range);
1269            }
1270        }
1271
1272        let layer_growth_algo = kind.common().growth_algo.clone();
1273
1274        debug!(
1275            "layer: {} algo: {:?} core order: {:?}",
1276            name, &layer_growth_algo, core_order
1277        );
1278
1279        Ok(Self {
1280            name: name.into(),
1281            kind,
1282            growth_algo: layer_growth_algo,
1283            core_order: core_order.clone(),
1284
1285            target_llc_cpus: (0, 0),
1286            assigned_llcs: vec![],
1287
1288            nr_cpus: 0,
1289            nr_llc_cpus: vec![0; topo.all_llcs.len()],
1290            cpus: Cpumask::new(),
1291            allowed_cpus,
1292        })
1293    }
1294
1295    fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1296        let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1297            Some(ret) => ret.clone(),
1298            None => return Ok(0),
1299        };
1300
1301        let nr_to_free = cpus_to_free.weight();
1302
1303        Ok(if nr_to_free <= max_to_free {
1304            trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1305            self.cpus &= &cpus_to_free.not();
1306            self.nr_cpus -= nr_to_free;
1307            for cpu in cpus_to_free.iter() {
1308                self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1309            }
1310            cpu_pool.free(&cpus_to_free)?;
1311            nr_to_free
1312        } else {
1313            0
1314        })
1315    }
1316
1317    fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1318        let new_cpus = match cpu_pool
1319            .alloc_cpus(&self.allowed_cpus, &self.core_order)
1320            .clone()
1321        {
1322            Some(ret) => ret.clone(),
1323            None => {
1324                trace!("layer-{} can't grow, no CPUs", &self.name);
1325                return Ok(0);
1326            }
1327        };
1328
1329        let nr_new_cpus = new_cpus.weight();
1330
1331        trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1332        self.cpus |= &new_cpus;
1333        self.nr_cpus += nr_new_cpus;
1334        for cpu in new_cpus.iter() {
1335            self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1336        }
1337        Ok(nr_new_cpus)
1338    }
1339}
1340#[derive(Debug, Clone)]
1341struct NodeInfo {
1342    node_mask: nix::sched::CpuSet,
1343    _node_id: usize,
1344}
1345
1346#[derive(Debug)]
1347struct GpuTaskAffinitizer {
1348    // This struct tracks the information necessary to periodically
1349    // affinitize GPU tasks to NUMA nodes when needed.
1350    gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1351    gpu_pids_to_devs: HashMap<Pid, u32>,
1352    last_process_time: Option<Instant>,
1353    sys: System,
1354    pid_map: HashMap<Pid, Vec<Pid>>,
1355    poll_interval: Duration,
1356    enable: bool,
1357    tasks_affinitized: u64,
1358    last_task_affinitization_ms: u64,
1359}
1360
1361impl GpuTaskAffinitizer {
1362    pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1363        GpuTaskAffinitizer {
1364            gpu_devs_to_node_info: HashMap::new(),
1365            gpu_pids_to_devs: HashMap::new(),
1366            last_process_time: None,
1367            sys: System::default(),
1368            pid_map: HashMap::new(),
1369            poll_interval: Duration::from_secs(poll_interval),
1370            enable,
1371            tasks_affinitized: 0,
1372            last_task_affinitization_ms: 0,
1373        }
1374    }
1375
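    // Return the index of the lowest set bit in an NVML CPU affinity bitmask,
    // which is delivered as an array of 64-bit words (LSB first within each word).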
1376    fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1377        for (chunk, &mask) in affinity.iter().enumerate() {
1378            let mut inner_offset: u64 = 1;
1379            for _ in 0..64 {
1380                if (mask & inner_offset) != 0 {
1381                    return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1382                }
1383                inner_offset = inner_offset << 1;
1384            }
1385        }
1386        anyhow::bail!("unable to get CPU from NVML bitmask");
1387    }
1388
1389    fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1390        let mut cpuset = CpuSet::new();
1391        for (cpu_id, _cpu) in &node.all_cpus {
1392            cpuset.set(*cpu_id)?;
1393        }
1394        Ok(cpuset)
1395    }
1396
1397    fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1398        let nvml = nvml()?;
1399        let device_count = nvml.device_count()?;
1400
1401        for idx in 0..device_count {
1402            let dev = nvml.device_by_index(idx)?;
1403            // 16 x 64-bit words covers machines with up to 1024 CPUs.
1404            let cpu = dev.cpu_affinity(16)?;
1405            let ideal_cpu = self.find_one_cpu(cpu)?;
1406            if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1407                self.gpu_devs_to_node_info.insert(
1408                    idx,
1409                    NodeInfo {
1410                        node_mask: self.node_to_cpuset(
1411                            topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1412                        )?,
1413                        _node_id: cpu.node_id,
1414                    },
1415                );
1416            }
1417        }
1418        Ok(())
1419    }
1420
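    /// Record pid -> GPU device index for every process NVML currently reports
    /// as running compute or graphics work on any device.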
1421    fn update_gpu_pids(&mut self) -> Result<()> {
1422        let nvml = nvml()?;
1423        for i in 0..nvml.device_count()? {
1424            let device = nvml.device_by_index(i)?;
1425            for proc in device
1426                .running_compute_processes()?
1427                .into_iter()
1428                .chain(device.running_graphics_processes()?.into_iter())
1429            {
1430                self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1431            }
1432        }
1433        Ok(())
1434    }
1435
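    /// Refresh the sysinfo process table (skipping optional per-process details
    /// we don't need) and rebuild the parent -> children map used to walk
    /// descendants.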
1436    fn update_process_info(&mut self) -> Result<()> {
1437        self.sys.refresh_processes_specifics(
1438            ProcessesToUpdate::All,
1439            true,
1440            ProcessRefreshKind::nothing(),
1441        );
1442        self.pid_map.clear();
1443        for (pid, proc_) in self.sys.processes() {
1444            if let Some(ppid) = proc_.parent() {
1445                self.pid_map.entry(ppid).or_default().push(*pid);
1446            }
1447        }
1448        Ok(())
1449    }
1450
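    /// Breadth-first walk of the parent -> children map starting at root_pid,
    /// returning every descendant PID along with each process's thread IDs.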
1451    fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1452        let mut work = VecDeque::from([root_pid]);
1453        let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1454
1455        while let Some(pid) = work.pop_front() {
1456            if pids_and_tids.insert(pid) {
1457                if let Some(kids) = self.pid_map.get(&pid) {
1458                    work.extend(kids);
1459                }
1460                if let Some(proc_) = self.sys.process(pid) {
1461                    if let Some(tasks) = proc_.tasks() {
1462                        pids_and_tids.extend(tasks.iter().copied());
1463                    }
1464                }
1465            }
1466        }
1467        pids_and_tids
1468    }
1469
1470    fn affinitize_gpu_pids(&mut self) -> Result<()> {
1471        if !self.enable {
1472            return Ok(());
1473        }
1474        for (pid, dev) in &self.gpu_pids_to_devs {
1475            let node_info = self
1476                .gpu_devs_to_node_info
1477                .get(dev)
1478                .expect("Unable to get gpu pid node mask");
1479            for child in self.get_child_pids_and_tids(*pid) {
1480                match nix::sched::sched_setaffinity(
1481                    nix::unistd::Pid::from_raw(child.as_u32() as i32),
1482                    &node_info.node_mask,
1483                ) {
1484                    Ok(_) => {
1485                        // Count this task as successfully affinitized.
1486                        self.tasks_affinitized += 1;
1487                    }
1488                    Err(_) => {
1489                        debug!(
1490                            "Error affinitizing gpu pid {} to node {:#?}",
1491                            child.as_u32(),
1492                            node_info
1493                        );
1494                    }
1495                };
1496            }
1497        }
1498        Ok(())
1499    }
1500
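    /// Called from the scheduler's main loop: re-checks GPU processes at most
    /// once per poll_interval and re-pins them (plus their descendants and
    /// threads) to their GPU's NUMA node. Errors are logged rather than
    /// propagated.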
1501    pub fn maybe_affinitize(&mut self) {
1502        if !self.enable {
1503            return;
1504        }
1505        let now = Instant::now();
1506
1507        if let Some(last_process_time) = self.last_process_time {
1508            if (now - last_process_time) < self.poll_interval {
1509                return;
1510            }
1511        }
1512
1513        match self.update_gpu_pids() {
1514            Ok(_) => {}
1515            Err(e) => {
1516                error!("Error updating GPU PIDs: {}", e);
1517            }
1518        };
1519        match self.update_process_info() {
1520            Ok(_) => {}
1521            Err(e) => {
1522                error!("Error updating process info to affinitize GPU PIDs: {}", e);
1523            }
1524        };
1525        match self.affinitize_gpu_pids() {
1526            Ok(_) => {}
1527            Err(e) => {
1528                error!("Error affinitizing GPU PIDs: {}", e);
1529            }
1530        };
1531        self.last_process_time = Some(now);
1532        self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1535    }
1536
1537    pub fn init(&mut self, topo: Arc<Topology>) {
1538        if !self.enable || self.last_process_time.is_some() {
1539            return;
1540        }
1541
1542        match self.init_dev_node_map(topo) {
1543            Ok(_) => {}
1544            Err(e) => {
1545                error!("Error initializing gpu node dev map: {}", e);
1546            }
1547        };
1548        self.sys = System::new_all();
1550    }
1551}
1552
1553struct Scheduler<'a> {
1554    skel: BpfSkel<'a>,
1555    struct_ops: Option<libbpf_rs::Link>,
1556    layer_specs: Vec<LayerSpec>,
1557
1558    sched_intv: Duration,
1559    layer_refresh_intv: Duration,
1560
1561    cpu_pool: CpuPool,
1562    layers: Vec<Layer>,
1563    idle_qos_enabled: bool,
1564
1565    proc_reader: fb_procfs::ProcReader,
1566    sched_stats: Stats,
1567
1568    nr_layer_cpus_ranges: Vec<(usize, usize)>,
1569    processing_dur: Duration,
1570
1571    topo: Arc<Topology>,
1572    netdevs: BTreeMap<String, NetDev>,
1573    stats_server: StatsServer<StatsReq, StatsRes>,
1574    gpu_task_handler: GpuTaskAffinitizer,
1575}
1576
1577impl<'a> Scheduler<'a> {
1578    fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1579        skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1580        let mut perf_set = false;
1581
1582        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1583        let mut layer_weights: Vec<usize> = vec![];
1584
1585        for (spec_i, spec) in specs.iter().enumerate() {
1586            let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1587
1588            for (or_i, or) in spec.matches.iter().enumerate() {
1589                for (and_i, and) in or.iter().enumerate() {
1590                    let mt = &mut layer.matches[or_i].matches[and_i];
1591
1592                    // Rules are allowlist-based by default
1593                    mt.exclude.write(false);
1594
1595                    match and {
1596                        LayerMatch::CgroupPrefix(prefix) => {
1597                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1598                            copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1599                        }
1600                        LayerMatch::CgroupSuffix(suffix) => {
1601                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1602                            copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1603                        }
1604                        LayerMatch::CgroupRegex(_) => {
1605                            panic!("CgroupRegex match only supported in template");
1606                        }
1607                        LayerMatch::CgroupContains(substr) => {
1608                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1609                            copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1610                        }
1611                        LayerMatch::CommPrefix(prefix) => {
1612                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1613                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1614                        }
1615                        LayerMatch::CommPrefixExclude(prefix) => {
1616                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1617                            mt.exclude.write(true);
1618                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1619                        }
1620                        LayerMatch::PcommPrefix(prefix) => {
1621                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1622                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1623                        }
1624                        LayerMatch::PcommPrefixExclude(prefix) => {
1625                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1626                            mt.exclude.write(true);
1627                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1628                        }
1629                        LayerMatch::NiceAbove(nice) => {
1630                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1631                            mt.nice = *nice;
1632                        }
1633                        LayerMatch::NiceBelow(nice) => {
1634                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1635                            mt.nice = *nice;
1636                        }
1637                        LayerMatch::NiceEquals(nice) => {
1638                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1639                            mt.nice = *nice;
1640                        }
1641                        LayerMatch::UIDEquals(user_id) => {
1642                            mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1643                            mt.user_id = *user_id;
1644                        }
1645                        LayerMatch::GIDEquals(group_id) => {
1646                            mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1647                            mt.group_id = *group_id;
1648                        }
1649                        LayerMatch::PIDEquals(pid) => {
1650                            mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1651                            mt.pid = *pid;
1652                        }
1653                        LayerMatch::PPIDEquals(ppid) => {
1654                            mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1655                            mt.ppid = *ppid;
1656                        }
1657                        LayerMatch::TGIDEquals(tgid) => {
1658                            mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1659                            mt.tgid = *tgid;
1660                        }
1661                        LayerMatch::NSPIDEquals(nsid, pid) => {
1662                            mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1663                            mt.nsid = *nsid;
1664                            mt.pid = *pid;
1665                        }
1666                        LayerMatch::NSEquals(nsid) => {
1667                            mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1668                            mt.nsid = *nsid as u64;
1669                        }
1670                        LayerMatch::CmdJoin(joincmd) => {
1671                            mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1672                            copy_into_cstr(&mut mt.comm_prefix, joincmd);
1673                        }
1674                        LayerMatch::IsGroupLeader(polarity) => {
1675                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1676                            mt.is_group_leader.write(*polarity);
1677                        }
1678                        LayerMatch::IsKthread(polarity) => {
1679                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1680                            mt.is_kthread.write(*polarity);
1681                        }
1682                        LayerMatch::UsedGpuTid(polarity) => {
1683                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1684                            mt.used_gpu_tid.write(*polarity);
1685                        }
1686                        LayerMatch::UsedGpuPid(polarity) => {
1687                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1688                            mt.used_gpu_pid.write(*polarity);
1689                        }
1690                        LayerMatch::AvgRuntime(min, max) => {
1691                            mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1692                            mt.min_avg_runtime_us = *min;
1693                            mt.max_avg_runtime_us = *max;
1694                        }
1695                        LayerMatch::HintEquals(hint) => {
1696                            mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1697                            mt.hint = *hint;
1698                        }
1699                    }
1700                }
1701                layer.matches[or_i].nr_match_ands = or.len() as i32;
1702            }
1703
1704            layer.nr_match_ors = spec.matches.len() as u32;
1705            layer.kind = spec.kind.as_bpf_enum();
1706
1707            {
1708                let LayerCommon {
1709                    min_exec_us,
1710                    yield_ignore,
1711                    perf,
1712                    preempt,
1713                    preempt_first,
1714                    exclusive,
1715                    allow_node_aligned,
1716                    skip_remote_node,
1717                    prev_over_idle_core,
1718                    growth_algo,
1719                    nodes,
1720                    slice_us,
1721                    fifo,
1722                    weight,
1723                    disallow_open_after_us,
1724                    disallow_preempt_after_us,
1725                    xllc_mig_min_us,
1726                    placement,
1727                    ..
1728                } = spec.kind.common();
1729
1730                layer.slice_ns = *slice_us * 1000;
1731                layer.fifo.write(*fifo);
1732                layer.min_exec_ns = min_exec_us * 1000;
1733                layer.yield_step_ns = if *yield_ignore > 0.999 {
1734                    0
1735                } else if *yield_ignore < 0.001 {
1736                    layer.slice_ns
1737                } else {
1738                    (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1739                };
1740                let mut layer_name: String = spec.name.clone();
1741                layer_name.truncate(MAX_LAYER_NAME);
1742                copy_into_cstr(&mut layer.name, layer_name.as_str());
1743                layer.preempt.write(*preempt);
1744                layer.preempt_first.write(*preempt_first);
1745                layer.excl.write(*exclusive);
1746                layer.allow_node_aligned.write(*allow_node_aligned);
1747                layer.skip_remote_node.write(*skip_remote_node);
1748                layer.prev_over_idle_core.write(*prev_over_idle_core);
1749                layer.growth_algo = growth_algo.as_bpf_enum();
1750                layer.weight = *weight;
1751                layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1752                    v if v == u64::MAX => v,
1753                    v => v * 1000,
1754                };
1755                layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1756                    v if v == u64::MAX => v,
1757                    v => v * 1000,
1758                };
1759                layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1760                layer_weights.push(layer.weight.try_into().unwrap());
1761                layer.perf = u32::try_from(*perf)?;
1762                layer.node_mask = nodemask_from_nodes(nodes) as u64;
1763                for (topo_node_id, topo_node) in &topo.nodes {
1764                    if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1765                        continue;
1766                    }
1767                    layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1768                }
1769
1770                let task_place = |place: u32| crate::types::layer_task_place(place);
1771                layer.task_place = match placement {
1772                    LayerPlacement::Standard => {
1773                        task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1774                    }
1775                    LayerPlacement::Sticky => {
1776                        task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1777                    }
1778                    LayerPlacement::Floating => {
1779                        task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1780                    }
1781                };
1782            }
1783
1784            layer.is_protected.write(match spec.kind {
1785                LayerKind::Open { .. } => false,
1786                LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1787                    protected
1788                }
1789            });
1790
1791            match &spec.cpuset {
1792                Some(mask) => {
1793                    Self::update_cpumask(&mask, &mut layer.cpuset);
1794                }
1795                None => {
1796                    for i in 0..layer.cpuset.len() {
1797                        layer.cpuset[i] = u8::MAX;
1798                    }
1799                }
1800            };
1801
1802            perf_set |= layer.perf > 0;
1803        }
1804
1805        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1806        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1807            skel.maps
1808                .rodata_data
1809                .as_mut()
1810                .unwrap()
1811                .layer_iteration_order[idx] = *layer_idx as u32;
1812        }
1813
1814        if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1815            warn!("cpufreq support not available, ignoring perf configurations");
1816        }
1817
1818        Ok(())
1819    }
1820
1821    fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1822        skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
1823        skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
1824
1825        for (&node_id, node) in &topo.nodes {
1826            debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1827            skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
1828            let raw_numa_slice = node.span.as_raw_slice();
1829            let node_cpumask_slice =
1830                &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
1831            let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1832            left.clone_from_slice(raw_numa_slice);
1833            debug!(
1834                "node {} mask: {:?}",
1835                node_id,
1836                skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
1837            );
1838
1839            for llc in node.llcs.values() {
1840                debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1841                skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
1842            }
1843        }
1844
1845        for cpu in topo.all_cpus.values() {
1846            skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1847        }
1848    }
1849
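    /// Build the per-CPU proximity map consumed by the BPF side: for each CPU,
    /// an ordering that starts with the CPU itself, then its SMT siblings, then
    /// the rest of its LLC, node, and finally the whole system. Each tier is
    /// "radiated" around the CPU/core ID so that different CPUs probe in
    /// different orders, and the *_end fields record the tier boundaries.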
1850    fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1851        let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1852            vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1853            vec
1854        };
1855        let radiate_cpu =
1856            |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1857                vec.sort_by_key(|&id| {
1858                    (
1859                        (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1860                        (center_cpu as i32 - id as i32).abs(),
1861                    )
1862                });
1863                vec
1864            };
1865
1866        for (&cpu_id, cpu) in &topo.all_cpus {
1867            // Collect the spans.
1868            let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1869            let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1870            let node_span = &topo.nodes[&cpu.node_id].span;
1871            let sys_span = &topo.span;
1872
1873            // Make the spans exclusive.
1874            let sys_span = sys_span.and(&node_span.not());
1875            let node_span = node_span.and(&llc_span.not());
1876            let llc_span = llc_span.and(&core_span.not());
1877            core_span.clear_cpu(cpu_id).unwrap();
1878
1879            // Convert them into arrays.
1880            let mut sys_order: Vec<usize> = sys_span.iter().collect();
1881            let mut node_order: Vec<usize> = node_span.iter().collect();
1882            let mut llc_order: Vec<usize> = llc_span.iter().collect();
1883            let mut core_order: Vec<usize> = core_span.iter().collect();
1884
1885            // Shuffle them so that different CPUs follow different orders.
1886            // Each CPU radiates in both directions based on the cpu id and
1887            // radiates out to the closest cores based on core ids.
1888
1889            sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1890            node_order = radiate(node_order, cpu.node_id);
1891            llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1892            core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1893
1894            // Concatenate and record the topology boundaries.
1895            let mut order: Vec<usize> = vec![];
1896            let mut idx: usize = 0;
1897
1898            idx += 1;
1899            order.push(cpu_id);
1900
1901            idx += core_order.len();
1902            order.append(&mut core_order);
1903            let core_end = idx;
1904
1905            idx += llc_order.len();
1906            order.append(&mut llc_order);
1907            let llc_end = idx;
1908
1909            idx += node_order.len();
1910            order.append(&mut node_order);
1911            let node_end = idx;
1912
1913            idx += sys_order.len();
1914            order.append(&mut sys_order);
1915            let sys_end = idx;
1916
1917            debug!(
1918                "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1919                cpu_id, core_end, llc_end, node_end, sys_end, &order
1920            );
1921
1922            // Record in cpu_ctx.
1923            let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1924            for (i, &cpu) in order.iter().enumerate() {
1925                pmap.cpus[i] = cpu as u16;
1926            }
1927            pmap.core_end = core_end as u32;
1928            pmap.llc_end = llc_end as u32;
1929            pmap.node_end = node_end as u32;
1930            pmap.sys_end = sys_end as u32;
1931        }
1932    }
1933
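    /// The per-CPU map update API takes one byte vector per CPU, so each
    /// plain-old-data cpu_ctx struct is reinterpreted as its raw bytes here.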
1934    fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1935        cpu_ctxs
1936            .into_iter()
1937            .map(|cpu_ctx| {
1938                let bytes = unsafe {
1939                    std::slice::from_raw_parts(
1940                        &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1941                        std::mem::size_of::<bpf_intf::cpu_ctx>(),
1942                    )
1943                };
1944                bytes.to_vec()
1945            })
1946            .collect()
1947    }
1948
1949    fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1950        let key = (0_u32).to_ne_bytes();
1951        let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1952        let cpu_ctxs_vec = skel
1953            .maps
1954            .cpu_ctxs
1955            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1956            .context("Failed to lookup cpu_ctx")?
1957            .unwrap();
1958
1959        let op_layers: Vec<u32> = layer_specs
1960            .iter()
1961            .enumerate()
1962            .filter(|(_idx, spec)| match &spec.kind {
1963                LayerKind::Open { .. } => spec.kind.common().preempt,
1964                _ => false,
1965            })
1966            .map(|(idx, _)| idx as u32)
1967            .collect();
1968        let on_layers: Vec<u32> = layer_specs
1969            .iter()
1970            .enumerate()
1971            .filter(|(_idx, spec)| match &spec.kind {
1972                LayerKind::Open { .. } => !spec.kind.common().preempt,
1973                _ => false,
1974            })
1975            .map(|(idx, _)| idx as u32)
1976            .collect();
1977        let gp_layers: Vec<u32> = layer_specs
1978            .iter()
1979            .enumerate()
1980            .filter(|(_idx, spec)| match &spec.kind {
1981                LayerKind::Grouped { .. } => spec.kind.common().preempt,
1982                _ => false,
1983            })
1984            .map(|(idx, _)| idx as u32)
1985            .collect();
1986        let gn_layers: Vec<u32> = layer_specs
1987            .iter()
1988            .enumerate()
1989            .filter(|(_idx, spec)| match &spec.kind {
1990                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1991                _ => false,
1992            })
1993            .map(|(idx, _)| idx as u32)
1994            .collect();
1995
1996        // FIXME - this incorrectly assumes all possible CPUs are consecutive.
1997        for cpu in 0..*NR_CPUS_POSSIBLE {
1998            cpu_ctxs.push(*unsafe {
1999                &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2000            });
2001
2002            let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2003            let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2004            cpu_ctxs[cpu].cpu = cpu as i32;
2005            cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2006            cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
2007            cpu_ctxs[cpu].is_big = is_big;
2008
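            // Seed the PRNG with the CPU ID so each CPU gets its own stable
            // shuffle of the layer iteration orders below.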
2009            fastrand::seed(cpu as u64);
2010
2011            let mut ogp_order = op_layers.clone();
2012            ogp_order.append(&mut gp_layers.clone());
2013            fastrand::shuffle(&mut ogp_order);
2014
2015            let mut ogn_order = on_layers.clone();
2016            ogn_order.append(&mut gn_layers.clone());
2017            fastrand::shuffle(&mut ogn_order);
2018
2019            let mut op_order = op_layers.clone();
2020            fastrand::shuffle(&mut op_order);
2021
2022            let mut on_order = on_layers.clone();
2023            fastrand::shuffle(&mut on_order);
2024
2025            let mut gp_order = gp_layers.clone();
2026            fastrand::shuffle(&mut gp_order);
2027
2028            let mut gn_order = gn_layers.clone();
2029            fastrand::shuffle(&mut gn_order);
2030
2031            for i in 0..MAX_LAYERS {
2032                cpu_ctxs[cpu].ogp_layer_order[i] =
2033                    ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2034                cpu_ctxs[cpu].ogn_layer_order[i] =
2035                    ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2036
2037                cpu_ctxs[cpu].op_layer_order[i] =
2038                    op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2039                cpu_ctxs[cpu].on_layer_order[i] =
2040                    on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2041                cpu_ctxs[cpu].gp_layer_order[i] =
2042                    gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2043                cpu_ctxs[cpu].gn_layer_order[i] =
2044                    gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2045            }
2046        }
2047
2048        Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2049
2050        skel.maps
2051            .cpu_ctxs
2052            .update_percpu(
2053                &key,
2054                &Self::convert_cpu_ctxs(cpu_ctxs),
2055                libbpf_rs::MapFlags::ANY,
2056            )
2057            .context("Failed to update cpu_ctx")?;
2058
2059        Ok(())
2060    }
2061
2062    fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2063        for (&llc_id, llc) in &topo.all_llcs {
2064            // Collect the orders.
2065            let mut node_order: Vec<usize> =
2066                topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2067            let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2068
2069            // Make the orders exclusive.
2070            sys_order.retain(|id| !node_order.contains(id));
2071            node_order.retain(|&id| id != llc_id);
2072
2073            // Shuffle so that different LLCs follow different orders. See
2074            // init_cpu_prox_map().
2075            fastrand::seed(llc_id as u64);
2076            fastrand::shuffle(&mut sys_order);
2077            fastrand::shuffle(&mut node_order);
2078
2079            // Concatenate and record the node boundary.
2080            let mut order: Vec<usize> = vec![];
2081            let mut idx: usize = 0;
2082
2083            idx += 1;
2084            order.push(llc_id);
2085
2086            idx += node_order.len();
2087            order.append(&mut node_order);
2088            let node_end = idx;
2089
2090            idx += sys_order.len();
2091            order.append(&mut sys_order);
2092            let sys_end = idx;
2093
2094            debug!(
2095                "LLC[{}] proximity map[{}/{}]: {:?}",
2096                llc_id, node_end, sys_end, &order
2097            );
2098
2099            // Record in llc_ctx.
2100            //
2101            // XXX - This would be a lot easier if llc_ctx were in the bss.
2102            // See BpfStats::read().
2103            let key = llc_id as u32;
2104            let llc_id_slice =
2105                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2106            let v = skel
2107                .maps
2108                .llc_data
2109                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2110                .unwrap()
2111                .unwrap();
2112            let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2113
2114            let pmap = &mut llcc.prox_map;
2115            for (i, &llc_id) in order.iter().enumerate() {
2116                pmap.llcs[i] = llc_id as u16;
2117            }
2118            pmap.node_end = node_end as u32;
2119            pmap.sys_end = sys_end as u32;
2120
2121            let v = unsafe {
2122                std::slice::from_raw_parts(
2123                    &llcc as *const bpf_intf::llc_ctx as *const u8,
2124                    std::mem::size_of::<bpf_intf::llc_ctx>(),
2125                )
2126            };
2127
2128            skel.maps
2129                .llc_data
2130                .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2131        }
2132
2133        Ok(())
2134    }
2135
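    /// One-shot scheduler construction: detect topology, sanitize layer specs,
    /// open and configure the BPF skeleton (rodata knobs, layer matches, node
    /// and proximity maps), load and attach it, then start the stats server and
    /// GPU task affinitizer.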
2136    fn init(
2137        opts: &'a Opts,
2138        layer_specs: &[LayerSpec],
2139        open_object: &'a mut MaybeUninit<OpenObject>,
2140        hint_to_layer_map: &HashMap<u64, usize>,
2141    ) -> Result<Self> {
2142        let nr_layers = layer_specs.len();
2143        let mut disable_topology = opts.disable_topology.unwrap_or(false);
2144
2145        let topo = Arc::new(if disable_topology {
2146            Topology::with_flattened_llc_node()?
2147        } else {
2148            Topology::new()?
2149        });
2150
2151        /*
2152         * FIXME: scx_layered incorrectly assumes that node, LLC and CPU IDs
2153         * are consecutive. Verify that they are on this system and bail if
2154         * not. It's lucky that core ID is not used anywhere as core IDs are
2155         * not consecutive on some Ryzen CPUs.
2156         */
2157        if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2158            bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2159        }
2160        if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2161            bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2162        }
2163        if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2164            bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2165        }
2166
2167        let netdevs = if opts.netdev_irq_balance {
2168            warn!(
2169                "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2170            );
2171            read_netdevs()?
2172        } else {
2173            BTreeMap::new()
2174        };
2175
2176        if !disable_topology {
2177            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2178                disable_topology = true;
2179            };
2180            info!(
2181                "Topology awareness not specified, selecting {} based on hardware",
2182                if disable_topology {
2183                    "disabled"
2184                } else {
2185                    "enabled"
2186                }
2187            );
2188        };
2189
2190        let cpu_pool = CpuPool::new(topo.clone())?;
2191
2192        // If topology awareness is disabled, clear out any configured NUMA/LLC
2193        // constraints so the layers fall back to using all cores.
2194        let layer_specs: Vec<_> = if disable_topology {
2195            info!("Disabling topology awareness");
2196            layer_specs
2197                .iter()
2198                .cloned()
2199                .map(|mut s| {
2200                    s.kind.common_mut().nodes.clear();
2201                    s.kind.common_mut().llcs.clear();
2202                    s
2203                })
2204                .collect()
2205        } else {
2206            layer_specs.to_vec()
2207        };
2208
2209        // Check kernel features
2210        init_libbpf_logging(None);
2211        let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2212        if !kfuncs_in_syscall {
2213            warn!("Using slow path: kfuncs not supported in syscall programs (kernel lacks commit a8e03b6bbb2c)");
2214        }
2215
2216        // Open the BPF prog first for verification.
2217        let mut skel_builder = BpfSkelBuilder::default();
2218        skel_builder.obj_builder.debug(opts.verbose > 1);
2219        info!(
2220            "Running scx_layered (build ID: {})",
2221            build_id::full_version(env!("CARGO_PKG_VERSION"))
2222        );
2223        let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2224        let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2225
2226        // Enable autoload for conditionally loaded programs immediately after
2227        // creating the skeleton (this always happens before loading).
2228        if opts.enable_gpu_support {
2229            // by default, enable open if gpu support is enabled.
2230            // open has been observed to be relatively cheap to kprobe.
2231            if opts.gpu_kprobe_level >= 1 {
2232                compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2233            }
2234            // enable the rest progressively based upon how often they are called
2235            // for observed workloads
2236            if opts.gpu_kprobe_level >= 2 {
2237                compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2238            }
2239            if opts.gpu_kprobe_level >= 3 {
2240                compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2241            }
2242        }
2243
2244        let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2245        let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2246
2247        let rodata = skel.maps.rodata_data.as_mut().unwrap();
2248
2249        if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
2250            rodata.ext_sched_class_addr = ext_sched_class_addr.unwrap();
2251            rodata.idle_sched_class_addr = idle_sched_class_addr.unwrap();
2252        } else {
2253            warn!(
2254                "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2255            );
2256        }
2257
2258        rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2259        rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2260
2261        // Initialize skel according to @opts.
2262        skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2263
2264        if !opts.disable_queued_wakeup {
2265            match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2266                0 => info!("Kernel does not support queued wakeup optimization"),
2267                v => skel.struct_ops.layered_mut().flags |= v,
2268            }
2269        }
2270
2271        rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2272        rodata.percpu_kthread_preempt_all =
2273            !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2274        rodata.debug = opts.verbose as u32;
2275        rodata.slice_ns = opts.slice_us * 1000;
2276        rodata.max_exec_ns = if opts.max_exec_us > 0 {
2277            opts.max_exec_us * 1000
2278        } else {
2279            opts.slice_us * 1000 * 20
2280        };
2281        rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2282        rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2283        rodata.smt_enabled = topo.smt_enabled;
2284        rodata.has_little_cores = topo.has_little_cores();
2285        rodata.xnuma_preemption = opts.xnuma_preemption;
2286        rodata.antistall_sec = opts.antistall_sec;
2287        rodata.monitor_disable = opts.monitor_disable;
2288        rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2289        rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2290        rodata.enable_antistall = !opts.disable_antistall;
2291        rodata.enable_match_debug = opts.enable_match_debug;
2292        rodata.enable_gpu_support = opts.enable_gpu_support;
2293        rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2294
2295        for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2296            rodata.__sibling_cpu[cpu] = *sib;
2297        }
2298        for cpu in topo.all_cpus.keys() {
2299            rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2300        }
2301
2302        rodata.nr_op_layers = layer_specs
2303            .iter()
2304            .filter(|spec| match &spec.kind {
2305                LayerKind::Open { .. } => spec.kind.common().preempt,
2306                _ => false,
2307            })
2308            .count() as u32;
2309        rodata.nr_on_layers = layer_specs
2310            .iter()
2311            .filter(|spec| match &spec.kind {
2312                LayerKind::Open { .. } => !spec.kind.common().preempt,
2313                _ => false,
2314            })
2315            .count() as u32;
2316        rodata.nr_gp_layers = layer_specs
2317            .iter()
2318            .filter(|spec| match &spec.kind {
2319                LayerKind::Grouped { .. } => spec.kind.common().preempt,
2320                _ => false,
2321            })
2322            .count() as u32;
2323        rodata.nr_gn_layers = layer_specs
2324            .iter()
2325            .filter(|spec| match &spec.kind {
2326                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2327                _ => false,
2328            })
2329            .count() as u32;
2330        rodata.nr_excl_layers = layer_specs
2331            .iter()
2332            .filter(|spec| spec.kind.common().exclusive)
2333            .count() as u32;
2334
2335        let mut min_open = u64::MAX;
2336        let mut min_preempt = u64::MAX;
2337
2338        for spec in layer_specs.iter() {
2339            if let LayerKind::Open { common, .. } = &spec.kind {
2340                min_open = min_open.min(common.disallow_open_after_us.unwrap());
2341                min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2342            }
2343        }
2344
2345        rodata.min_open_layer_disallow_open_after_ns = match min_open {
2346            u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2347            v => v,
2348        };
2349        rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2350            u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2351            v => v,
2352        };
2353
2354        // Consider all layers empty at the beginning.
2355        for i in 0..layer_specs.len() {
2356            skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2357        }
2358        skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2359
2360        // We set the pin path before loading the skeleton. This will ensure
2361        // libbpf creates and pins the map, or reuses the pinned map fd for us,
2362        // so that we can keep reusing the older map already pinned on scheduler
2363        // restarts.
2364        let layered_task_hint_map_path = &opts.task_hint_map;
2365        let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2366        // Only set pin path if a path is provided.
2367        if !layered_task_hint_map_path.is_empty() {
2368            hint_map.set_pin_path(layered_task_hint_map_path).unwrap();
2369            rodata.task_hint_map_enabled = true;
2370        }
2371
2372        if !opts.hi_fb_thread_name.is_empty() {
2373            let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2374            copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2375            rodata.enable_hi_fb_thread_name_match = true;
2376        }
2377
2378        Self::init_layers(&mut skel, &layer_specs, &topo)?;
2379        Self::init_nodes(&mut skel, opts, &topo);
2380
2381        let mut skel = scx_ops_load!(skel, layered, uei)?;
2382
2383        // Populate the mapping of hints to layer IDs for faster lookups
2384        if !hint_to_layer_map.is_empty() {
2385            for (k, v) in hint_to_layer_map.iter() {
2386                let key: u32 = *k as u32;
2387                let value: u32 = *v as u32;
2388                skel.maps.hint_to_layer_id_map.update(
2389                    &key.to_ne_bytes(),
2390                    &value.to_ne_bytes(),
2391                    libbpf_rs::MapFlags::ANY,
2392                )?;
2393            }
2394        }
2395
2396        let mut layers = vec![];
2397        let layer_growth_orders =
2398            LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2399        for (idx, spec) in layer_specs.iter().enumerate() {
2400            let growth_order = layer_growth_orders
2401                .get(&idx)
2402                .with_context(|| "layer has no growth order".to_string())?;
2403            layers.push(Layer::new(spec, &topo, growth_order)?);
2404        }
2405
2406        let mut idle_qos_enabled = layers
2407            .iter()
2408            .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2409        if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2410            warn!("idle_resume_us not supported, ignoring");
2411            idle_qos_enabled = false;
2412        }
2413
2414        Self::init_cpus(&skel, &layer_specs, &topo)?;
2415        Self::init_llc_prox_map(&mut skel, &topo)?;
2416
2417        // Other stuff.
2418        let proc_reader = fb_procfs::ProcReader::new();
2419
2420        // Handle setup if layered is running in a pid namespace.
2421        let input = ProgramInput {
2422            ..Default::default()
2423        };
2424        let prog = &mut skel.progs.initialize_pid_namespace;
2425
2426        let _ = prog.test_run(input);
2427
2428        // XXX If we try to refresh the cpumasks here before attaching, we
2429        // sometimes (non-deterministically) don't see the updated values in
2430        // BPF. It would be better to update the cpumasks here before we
2431        // attach, but the value will quickly converge anyways so it's not a
2432        // huge problem in the interim until we figure it out.
2433
2434        // Allow all tasks to open and write to BPF task hint map, now that
2435        // we should have it pinned at the desired location.
2436        if !layered_task_hint_map_path.is_empty() {
2437            let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2438            let mode: libc::mode_t = 0o666;
2439            unsafe {
2440                if libc::chmod(path.as_ptr(), mode) != 0 {
2441                    trace!("'chmod' to 666 of task hint map failed, continuing...");
2442                }
2443            }
2444        }
2445
2446        // Attach.
2447        let struct_ops = scx_ops_attach!(skel, layered)?;
2448        let stats_server = StatsServer::new(stats::server_data()).launch()?;
2449        let mut gpu_task_handler =
2450            GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2451        gpu_task_handler.init(topo.clone());
2452        let sched = Self {
2453            struct_ops: Some(struct_ops),
2454            layer_specs,
2455
2456            sched_intv: Duration::from_secs_f64(opts.interval),
2457            layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2458
2459            cpu_pool,
2460            layers,
2461            idle_qos_enabled,
2462
2463            sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2464
2465            nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2466            processing_dur: Default::default(),
2467
2468            proc_reader,
2469            skel,
2470
2471            topo,
2472            netdevs,
2473            stats_server,
2474            gpu_task_handler,
2475        };
2476
2477        info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2478
2479        Ok(sched)
2480    }
2481
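    /// Pack a Cpumask into the BPF-side byte array: bit (cpu % 8) of byte
    /// (cpu / 8) is set iff the CPU is present in the mask.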
2482    fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2483        for cpu in 0..mask.len() {
2484            if mask.test_cpu(cpu) {
2485                bpfmask[cpu / 8] |= 1 << (cpu % 8);
2486            } else {
2487                bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2488            }
2489        }
2490    }
2491
2492    fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2493        trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2494        Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2495
2496        bpf_layer.nr_cpus = layer.nr_cpus as u32;
2497        for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2498            bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2499        }
2500
2501        bpf_layer.refresh_cpus = 1;
2502    }
2503
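    /// For each managed network device, point its IRQ affinity masks at the
    /// currently unallocated CPUs on the device's NUMA node, falling back to
    /// the whole node when no free CPUs are available there.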
2504    fn update_netdev_cpumasks(&mut self) -> Result<()> {
2505        let available_cpus = self.cpu_pool.available_cpus();
2506        if available_cpus.is_empty() {
2507            return Ok(());
2508        }
2509
2510        for (iface, netdev) in self.netdevs.iter_mut() {
2511            let node = self
2512                .topo
2513                .nodes
2514                .values()
2515                .find(|n| n.id == netdev.node())
2517                .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2518            let node_cpus = node.span.clone();
2519            for (irq, irqmask) in netdev.irqs.iter_mut() {
2520                irqmask.clear_all();
2521                for cpu in available_cpus.iter() {
2522                    if !node_cpus.test_cpu(cpu) {
2523                        continue;
2524                    }
2525                    let _ = irqmask.set_cpu(cpu);
2526                }
2527                // If no CPUs are available in the node then spread the load across the node
2528                if irqmask.weight() == 0 {
2529                    for cpu in node_cpus.iter() {
2530                        let _ = irqmask.set_cpu(cpu);
2531                    }
2532                }
2533                trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2534            }
2535            netdev.apply_cpumasks()?;
2536        }
2537
2538        Ok(())
2539    }
2540
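    /// Cap a layer's high CPU target using last cycle's memory bandwidth: the
    /// per-CPU bandwidth is estimated from the previous allocation and the cap
    /// is the CPU count that keeps the estimated total under membw_limit.
    /// Rough example (assumed numbers): 64 GiB/s over 8 CPUs is 8 GiB/s per
    /// CPU, so a 40 GiB/s limit clamps high to 40 / 8 = 5 CPUs, provided 5
    /// still lies between low and high.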
2541    fn clamp_target_by_membw(
2542        &self,
2543        layer: &Layer,
2544        membw_limit: usize,
2545        last_membw: usize,
2546        low: usize,
2547        high: usize,
2548    ) -> usize {
2549        let ncpu = layer.cpus.weight();
2550        let last_membw_percpu = if ncpu > 0 { last_membw / ncpu } else { 0 };
2551
2552        // If there's no way we're hitting the limit, we're good.
2553        if last_membw_percpu * high < membw_limit {
2554            return high;
2555        }
2556
2557        // If there's no way to drop our memory usage down enough,
2558        // pin the target CPUs to low and drop a warning.
2559        if last_membw_percpu * low > membw_limit {
2560            warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2561            return low;
2562        }
2563
2564        // The bandwidth-derived cap falls between low and high; clamp high down to it.
2565        membw_limit / last_membw_percpu
2566    }
2567
2568    /// Calculate how many CPUs each layer would like to have if there were
2569    /// no competition. The CPU range is determined by applying the inverse
2570    /// of util_range and then capping by cpus_range. If the current
2571    /// allocation is within the acceptable range, no change is made.
2572    /// Returns (target, min) pair for each layer.
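    /// For example (assumed numbers), with util = 2.4 CPUs' worth of load and
    /// util_range = (0.6, 0.9): low = ceil(2.4 / 0.9) = 3 and
    /// high = floor(2.4 / 0.6) = 4, so the current CPU count is clamped into
    /// [3, 4] before cpus_range is applied.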
2573    fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2574        let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2575        let utils = &self.sched_stats.layer_utils;
2576
2577        let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2578        let mut targets: Vec<(usize, usize)> = vec![];
2579
2580        for (idx, layer) in self.layers.iter().enumerate() {
2581            targets.push(match &layer.kind {
2582                LayerKind::Confined {
2583                    util_range,
2584                    cpus_range,
2585                    cpus_range_frac,
2586                    membw_gb,
2587                    ..
2588                }
2589                | LayerKind::Grouped {
2590                    util_range,
2591                    cpus_range,
2592                    cpus_range_frac,
2593                    membw_gb,
2594                    ..
2595                } => {
2596                    // A grouped layer can choose to include open cputime
2597                    // for sizing. Also, as an empty layer can only get CPU
2598                    // time through fallback (counted as owned) or open
2599                    // execution, add open cputime for empty layers.
2600                    let owned = utils[idx][LAYER_USAGE_OWNED];
2601                    let open = utils[idx][LAYER_USAGE_OPEN];
2602
2603                    let mut util = owned;
2604                    if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2605                        util += open;
2606                    }
2607
2608                    let util = if util < 0.01 { 0.0 } else { util };
2609                    let low = (util / util_range.1).ceil() as usize;
2610                    let mut high = ((util / util_range.0).floor() as usize).max(low);
2611
2612                    if let Some(membw_limit) = membw_gb {
2613                        let membw_cpus = &self.sched_stats.layer_membws[idx];
2614                        let last_membw = membw_cpus.into_iter().map(|x: &f64| *x as usize).sum();
2615                        high = self.clamp_target_by_membw(
2616                            &layer,
2617                            (*membw_limit * (1024_u64.pow(3) as f64)) as usize,
2618                            last_membw,
2619                            low,
2620                            high,
2621                        );
2622                    }
2623
2624                    let target = layer.cpus.weight().clamp(low, high);
2625                    let cpus_range =
2626                        resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2627
2628                    records.push((
2629                        (owned * 100.0) as u64,
2630                        (open * 100.0) as u64,
2631                        (util * 100.0) as u64,
2632                        low,
2633                        high,
2634                        target,
2635                    ));
2636
2637                    (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2638                }
2639                LayerKind::Open { .. } => (0, 0),
2640            });
2641        }
2642
2643        trace!("initial targets: {:?}", &targets);
2644        trace!("(owned, open, util, low, high, target): {:?}", &records);
2645        targets
2646    }
2647
2648    /// Given the (target, min) pair for each layer, which was determined
2649    /// assuming an unlimited number of CPUs, distribute the actual CPUs
2650    /// according to the layers' weights.
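    /// Worked example (assumed numbers): with 16 CPUs and layers
    /// A(target 2, weight 100), B(target 10, weight 100), C(target 20, weight 200),
    /// A fits under its weighted share and gets 2, while B and C remain in
    /// contention and the remaining 14 CPUs split by weight into 5 and 9.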
2651    fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2652        let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2653        let weights: Vec<usize> = self
2654            .layers
2655            .iter()
2656            .map(|layer| layer.kind.common().weight as usize)
2657            .collect();
2658        let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2659            .iter()
2660            .zip(&weights)
2661            .enumerate()
2662            .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2663            .collect();
2664        let mut weight_sum: usize = weights.iter().sum();
2665        let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2666
2667        trace!("cands: {:?}", &cands);
2668
2669        // First, accept all layers that are <= min.
2670        cands.retain(|&i, &mut (target, min, weight)| {
2671            if target <= min {
2672                let target = target.min(nr_left);
2673                weighted[i] = target;
2674                weight_sum -= weight;
2675                nr_left -= target;
2676                false
2677            } else {
2678                true
2679            }
2680        });
2681
2682        trace!("cands after accepting mins: {:?}", &cands);
2683
2684        // Keep accepting ones under their allotted share.
2685        let calc_share = |nr_left, weight, weight_sum| {
2686            (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2687        };
2688
2689        while !cands.is_empty() {
2690            let mut progress = false;
2691
2692            cands.retain(|&i, &mut (target, _min, weight)| {
2693                let share = calc_share(nr_left, weight, weight_sum);
2694                if target <= share {
2695                    weighted[i] = target;
2696                    weight_sum -= weight;
2697                    nr_left -= target;
2698                    progress = true;
2699                    false
2700                } else {
2701                    true
2702                }
2703            });
2704
2705            if !progress {
2706                break;
2707            }
2708        }
2709
2710        trace!("cands after accepting under allotted: {:?}", &cands);
2711
2712        // The remaining candidates are in contention with each other,
2713        // distribute according to the shares.
2714        let nr_to_share = nr_left;
2715        for (i, (_target, _min, weight)) in cands.into_iter() {
2716            let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2717            weighted[i] = share;
2718            nr_left -= share;
2719        }
2720
2721        trace!("weighted: {:?}", &weighted);
2722
2723        weighted
2724    }
2725
2726    // Figure out a tuple (LLCs, extra_cpus) in terms of the target CPUs
2727    // computed by weighted_target_nr_cpus. Returns the number of full LLCs
2728    // occupied by a layer, and any extra CPUs that don't occupy a full LLC.
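    // E.g. with 8 cores per LLC and 2 SMT threads per core (16 CPUs per LLC),
    // a target of 40 CPUs maps to (2 full LLCs, 4 extra cores).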
2729    fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2730        // TODO(kkd): We assume each LLC has equal number of cores.
2731        let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2732        // TODO(kkd): We assume each core has fixed number of threads.
2733        let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2734        let cpus_per_llc = cores_per_llc * cpus_per_core;
2735
2736        let full = target / cpus_per_llc;
2737        let extra = target % cpus_per_llc;
2738
2739        (full, extra.div_ceil(cpus_per_core))
2740    }
2741
2742    // Recalculate the core order for layers using the StickyDynamic growth
2743    // algorithm. Tuples from compute_target_llcs decide how many LLCs and
2744    // cores should be assigned to each layer; the logic that allocates and
2745    // frees CPUs operates on that core order. This happens in three logical
2746    // steps: first free LLCs from layers that shrank since the last
2747    // recomputation, then distribute the freed LLCs to growing layers, and
2748    // finally spill over the remaining cores into free LLCs.
2749    fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
2750        // Collect freed LLCs from shrinking layers.
2751        debug!(
2752            " free: before pass: free_llcs={:?}",
2753            self.cpu_pool.free_llcs
2754        );
2755        for &(idx, target) in layer_targets.iter().rev() {
2756            let layer = &mut self.layers[idx];
2757            let old_tlc = layer.target_llc_cpus;
2758            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2759
2760            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2761                continue;
2762            }
2763
2764            let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
2765
2766            debug!(
2767                " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2768                layer.name,
2769                old_tlc,
2770                new_tlc,
2771                to_free,
2772                layer.assigned_llcs.len(),
2773                self.cpu_pool.free_llcs.len()
2774            );
2775
2776            while to_free > 0 && layer.assigned_llcs.len() > 0 {
2777                let llc = layer.assigned_llcs.pop().unwrap();
2778                self.cpu_pool.free_llcs.push((llc, 0));
2779                to_free -= 1;
2780
2781                debug!(" layer={} freed_llc={}", layer.name, llc);
2782            }
2783        }
2784        debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2785
2786        // Redistribute the freed LLCs to growing layers.
2787        for &(idx, target) in layer_targets.iter().rev() {
2788            let layer = &mut self.layers[idx];
2789            let old_tlc = layer.target_llc_cpus;
2790            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2791
2792            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2793                continue;
2794            }
2795
2796            let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
2797
2798            debug!(
2799                " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2800                layer.name,
2801                old_tlc,
2802                new_tlc,
2803                to_alloc,
2804                layer.assigned_llcs.len(),
2805                self.cpu_pool.free_llcs.len()
2806            );
2807
2808            while to_alloc > 0
2809                && self.cpu_pool.free_llcs.len() > 0
2810                && to_alloc <= self.cpu_pool.free_llcs.len()
2811            {
2812                let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2813                layer.assigned_llcs.push(llc);
2814                to_alloc -= 1;
2815
2816                debug!(" layer={} alloc_llc={}", layer.name, llc);
2817            }
2818
2819            debug!(
2820                " alloc: layer={} assigned_llcs={:?}",
2821                layer.name, layer.assigned_llcs
2822            );
2823
2824            // Update for next iteration.
2825            layer.target_llc_cpus = new_tlc;
2826        }
2827
2828        // Spill over overflowing cores into free LLCs. Bigger layers get to
2829        // take a chunk before smaller layers.
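        // For example (illustrative): a layer needing 3 extra cores takes 3
        // cores from the first free LLC that still has room; the next layer
        // then starts after those consumed cores, which free_llcs[i].1 tracks
        // until it is reset below.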
2830        for &(idx, _) in layer_targets.iter() {
2831            let mut core_order = vec![];
2832            let layer = &mut self.layers[idx];
2833
2834            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2835                continue;
2836            }
2837
2838            let tlc = layer.target_llc_cpus;
2839            let mut extra = tlc.1;
2840            // TODO(kkd): Move this logic into cpu_pool? What's the best place?
2841            let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2842            let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2843            let cpus_per_llc = cores_per_llc * cpus_per_core;
2844
2845            // Consume from front since we pop from the back.
2846            for i in 0..self.cpu_pool.free_llcs.len() {
2847                let free_vec = &mut self.cpu_pool.free_llcs;
2848                // Available CPUs in LLC.
2849                let avail = cpus_per_llc - free_vec[i].1;
2850                // The amount we'll take from this LLC.
2851                let used = extra.min(avail);
2852
2853                let shift = free_vec[i].1;
2854                free_vec[i].1 += used;
2855
2856                let llc_id = free_vec[i].0;
2857                let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2858
2859                // Take exactly `used` cores, skipping the cores already
2860                // consumed from this LLC by layers processed earlier in
2861                // this pass.
2862                for core in llc.cores.iter().skip(shift).take(used) {
2863                    core_order.push(core.1.id);
2864                }
2865
2866                extra -= used;
2868                if extra == 0 {
2869                    break;
2870                }
2871            }
2872
2873            core_order.reverse();
2874            layer.core_order = core_order;
2875        }
2876
2877        // Reset consumed entries in free LLCs.
2878        for i in 0..self.cpu_pool.free_llcs.len() {
2879            self.cpu_pool.free_llcs[i].1 = 0;
2880        }
2881
2882        for &(idx, _) in layer_targets.iter() {
2883            let layer = &mut self.layers[idx];
2884
2885            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2886                continue;
2887            }
2888
2889            for core in self.topo.all_cores.iter() {
2890                let llc_id = core.1.llc_id;
2891                if layer.assigned_llcs.contains(&llc_id) {
2892                    layer.core_order.push(core.1.id);
2893                }
2894            }
2895            // Update core_order for the layer, but reverse to keep the start stable.
2896            layer.core_order.reverse();
2897
2898            debug!(
2899                " alloc: layer={} core_order={:?}",
2900                layer.name, layer.core_order
2901            );
2902        }
2903    }
2904
2905    fn refresh_cpumasks(&mut self) -> Result<()> {
2906        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
2907
2908        let mut updated = false;
2909        let targets = self.calc_target_nr_cpus();
2910        let targets = self.weighted_target_nr_cpus(&targets);
2911
2912        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
2913        ascending.sort_by(|a, b| a.1.cmp(&b.1));
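        // `ascending` pairs each layer index with its target, sorted by target,
        // e.g. (illustrative) targets [4, 11, 1] become [(2, 1), (0, 4), (1, 11)].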
2914
2915        self.recompute_layer_core_order(&ascending);
2916
2917        // If any layer is growing, guarantee that the largest layer that is
2918        // freeing CPUs frees at least one CPU.
2919        let mut force_free = self
2920            .layers
2921            .iter()
2922            .zip(targets.iter())
2923            .any(|(layer, &target)| layer.nr_cpus < target);
2924
2925        // Shrink all layers first so that CPUs are available for
2926        // redistribution. Do so in descending order of target number of
2927        // CPUs.
2928        for &(idx, target) in ascending.iter().rev() {
2929            let layer = &mut self.layers[idx];
2930            if layer_is_open(layer) {
2931                continue;
2932            }
2933
2934            let nr_cur = layer.cpus.weight();
2935            if nr_cur <= target {
2936                continue;
2937            }
2938            let mut nr_to_free = nr_cur - target;
2939
2940            // There's some dampening built into the util metrics, but slow
2941            // down freeing further to avoid unnecessary changes. This is based
2942            // solely on intuition; drop or update it according to real-world
2943            // behavior.
2944            let nr_to_break_at = nr_to_free / 2;
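            // For example, if nr_to_free starts at 6, this pass stops freeing
            // once 3 or fewer CPUs remain to be freed.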
2945
2946            let mut freed = false;
2947
2948            while nr_to_free > 0 {
2949                let max_to_free = if force_free {
2950                    force_free = false;
2951                    layer.nr_cpus
2952                } else {
2953                    nr_to_free
2954                };
2955
2956                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
2957                if nr_freed == 0 {
2958                    break;
2959                }
2960
2961                nr_to_free = nr_to_free.saturating_sub(nr_freed);
2962                freed = true;
2963
2964                if nr_to_free <= nr_to_break_at {
2965                    break;
2966                }
2967            }
2968
2969            if freed {
2970                Self::update_bpf_layer_cpumask(
2971                    layer,
2972                    &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
2973                );
2974                updated = true;
2975            }
2976        }
2977
2978        // Grow layers. Do so in ascending order of target number of CPUs so
2979        // that we're always more generous to smaller layers. This avoids
2980        // starving small layers and shouldn't make a noticeable difference
2981        // for bigger layers, as work conservation should still be achieved
2982        // through open execution.
2983        for &(idx, target) in &ascending {
2984            let layer = &mut self.layers[idx];
2985
2986            if layer_is_open(layer) {
2987                continue;
2988            }
2989
2990            let nr_cur = layer.cpus.weight();
2991            if nr_cur >= target {
2992                continue;
2993            }
2994
2995            let mut nr_to_alloc = target - nr_cur;
2996            let mut alloced = false;
2997
2998            while nr_to_alloc > 0 {
2999                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3000                if nr_alloced == 0 {
3001                    break;
3002                }
3003                alloced = true;
3004                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3005            }
3006
3007            if alloced {
3008                Self::update_bpf_layer_cpumask(
3009                    layer,
3010                    &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3011                );
3012                updated = true;
3013            }
3014        }
3015
3016        // Give the rest to the open layers.
3017        if updated {
3018            for (idx, layer) in self.layers.iter_mut().enumerate() {
3019                if !layer_is_open(layer) {
3020                    continue;
3021                }
3022
3023                let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3024                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3025                let nr_available_cpus = available_cpus.weight();
3026
3027                // Open layers need the intersection of allowed cpus and
3028                // available cpus.
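                // e.g. (illustrative) available CPUs {0-3} intersected with an
                // allowed mask of {2-7} leaves the open layer with {2, 3}.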
3029                layer.cpus = available_cpus;
3030                layer.nr_cpus = nr_available_cpus;
3031                Self::update_bpf_layer_cpumask(layer, bpf_layer);
3032            }
3033
3034            self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3035                self.cpu_pool.fallback_cpu as u32;
3036
3037            for (lidx, layer) in self.layers.iter().enumerate() {
3038                self.nr_layer_cpus_ranges[lidx] = (
3039                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3040                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3041                );
3042            }
3043
3044            // Trigger updates on the BPF side.
3045            let input = ProgramInput {
3046                ..Default::default()
3047            };
3048            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3049            let _ = prog.test_run(input);
3050
3051            // Update empty_layers.
3052            let empty_layer_ids: Vec<u32> = self
3053                .layers
3054                .iter()
3055                .enumerate()
3056                .filter(|(_idx, layer)| layer.nr_cpus == 0)
3057                .map(|(idx, _layer)| idx as u32)
3058                .collect();
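            // Unused slots are padded with MAX_LAYERS as a sentinel, e.g.
            // (illustrative) empty layers {1, 3} yield [1, 3, MAX_LAYERS, ...].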
3059            for i in 0..self.layers.len() {
3060                self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3061                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3062            }
3063            self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3064                empty_layer_ids.len() as u32;
3065        }
3066
3067        let _ = self.update_netdev_cpumasks();
3068        Ok(())
3069    }
3070
3071    fn refresh_idle_qos(&mut self) -> Result<()> {
3072        if !self.idle_qos_enabled {
3073            return Ok(());
3074        }
3075
3076        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3077        for layer in self.layers.iter() {
3078            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3079            for cpu in layer.cpus.iter() {
3080                cpu_idle_qos[cpu] = idle_resume_us;
3081            }
3082        }
3083
3084        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3085            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3086        }
3087
3088        Ok(())
3089    }
3090
3091    fn step(&mut self) -> Result<()> {
3092        let started_at = Instant::now();
3093        self.sched_stats.refresh(
3094            &mut self.skel,
3095            &self.proc_reader,
3096            started_at,
3097            self.processing_dur,
3098            &self.gpu_task_handler,
3099        )?;
3100        self.refresh_cpumasks()?;
3101        self.refresh_idle_qos()?;
3102        self.gpu_task_handler.maybe_affinitize();
3103        self.processing_dur += Instant::now().duration_since(started_at);
3104        Ok(())
3105    }
3106
3107    fn generate_sys_stats(
3108        &mut self,
3109        stats: &Stats,
3110        cpus_ranges: &mut [(usize, usize)],
3111    ) -> Result<SysStats> {
3112        let bstats = &stats.bpf_stats;
3113        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3114
3115        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3116            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3117            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3118            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3119        }
3120
3121        Ok(sys_stats)
3122    }
3123
3124    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3125        let (res_ch, req_ch) = self.stats_server.channels();
3126        let mut next_sched_at = Instant::now() + self.sched_intv;
3127        let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3128        let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3129        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3130
3131        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3132            let now = Instant::now();
3133
3134            if now >= next_sched_at {
3135                self.step()?;
3136                while next_sched_at < now {
3137                    next_sched_at += self.sched_intv;
3138                }
3139            }
3140
3141            if enable_layer_refresh && now >= next_layer_refresh_at {
3142                self.skel
3143                    .maps
3144                    .bss_data
3145                    .as_mut()
3146                    .unwrap()
3147                    .layer_refresh_seq_avgruntime += 1;
3148                while next_layer_refresh_at < now {
3149                    next_layer_refresh_at += self.layer_refresh_intv;
3150                }
3151            }
3152
3153            match req_ch.recv_deadline(next_sched_at) {
3154                Ok(StatsReq::Hello(tid)) => {
3155                    cpus_ranges.insert(
3156                        tid,
3157                        self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3158                    );
3159                    let stats =
3160                        Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3161                    res_ch.send(StatsRes::Hello(stats))?;
3162                }
3163                Ok(StatsReq::Refresh(tid, mut stats)) => {
3164                    // Propagate our per-layer CPU ranges into each stats client's ranges.
3165                    for i in 0..self.nr_layer_cpus_ranges.len() {
3166                        for (_, ranges) in cpus_ranges.iter_mut() {
3167                            ranges[i] = (
3168                                ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3169                                ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3170                            );
3171                        }
3172                        self.nr_layer_cpus_ranges[i] =
3173                            (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3174                    }
3175
3176                    stats.refresh(
3177                        &mut self.skel,
3178                        &self.proc_reader,
3179                        now,
3180                        self.processing_dur,
3181                        &self.gpu_task_handler,
3182                    )?;
3183                    let sys_stats =
3184                        self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3185                    res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3186                }
3187                Ok(StatsReq::Bye(tid)) => {
3188                    cpus_ranges.remove(&tid);
3189                    res_ch.send(StatsRes::Bye)?;
3190                }
3191                Err(RecvTimeoutError::Timeout) => {}
3192                Err(e) => Err(e)?,
3193            }
3194        }
3195
3196        let _ = self.struct_ops.take();
3197        uei_report!(&self.skel, uei)
3198    }
3199}
3200
3201impl Drop for Scheduler<'_> {
3202    fn drop(&mut self) {
3203        info!("Unregister {SCHEDULER_NAME} scheduler");
3204
3205        if let Some(struct_ops) = self.struct_ops.take() {
3206            drop(struct_ops);
3207        }
3208    }
3209}
3210
3211fn write_example_file(path: &str) -> Result<()> {
3212    let mut f = fs::OpenOptions::new()
3213        .create_new(true)
3214        .write(true)
3215        .open(path)?;
3216    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3217}
3218
3219fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, usize>> {
3220    let mut hint_to_layer_map = HashMap::<u64, (usize, String)>::new();
3221
3222    let nr_specs = specs.len();
3223    if nr_specs == 0 {
3224        bail!("No layer spec");
3225    }
3226    if nr_specs > MAX_LAYERS {
3227        bail!("Too many layer specs");
3228    }
3229
3230    for (idx, spec) in specs.iter().enumerate() {
3231        if idx < nr_specs - 1 {
3232            if spec.matches.is_empty() {
3233                bail!("Non-terminal spec {:?} has no matches", spec.name);
3234            }
3235        } else {
3236            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3237                bail!("Terminal spec {:?} must have an empty match", spec.name);
3238            }
3239        }
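        // Note that spec.matches is an OR of AND blocks: e.g. (illustrative)
        // [[A, B], [C]] matches tasks satisfying (A && B) || C, and the
        // terminal spec's single empty AND block matches every task.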
3240
3241        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3242            bail!(
3243                "Spec {:?} has too many ({}) OR match blocks",
3244                spec.name,
3245                spec.matches.len()
3246            );
3247        }
3248
3249        for (ands_idx, ands) in spec.matches.iter().enumerate() {
3250            if ands.len() > NR_LAYER_MATCH_KINDS {
3251                bail!(
3252                    "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3253                    spec.name,
3254                    ands_idx,
3255                    ands.len()
3256                );
3257            }
3258            let mut hint_equals_cnt = 0;
3259            for one in ands.iter() {
3260                match one {
3261                    LayerMatch::CgroupPrefix(prefix) => {
3262                        if prefix.len() > MAX_PATH {
3263                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3264                        }
3265                    }
3266                    LayerMatch::CgroupSuffix(suffix) => {
3267                        if suffix.len() > MAX_PATH {
3268                            bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3269                        }
3270                    }
3271                    LayerMatch::CgroupContains(substr) => {
3272                        if substr.len() > MAX_PATH {
3273                            bail!("Spec {:?} has too long a cgroup substr", spec.name);
3274                        }
3275                    }
3276                    LayerMatch::CommPrefix(prefix) => {
3277                        if prefix.len() > MAX_COMM {
3278                            bail!("Spec {:?} has too long a comm prefix", spec.name);
3279                        }
3280                    }
3281                    LayerMatch::PcommPrefix(prefix) => {
3282                        if prefix.len() > MAX_COMM {
3283                            bail!("Spec {:?} has too long a process name prefix", spec.name);
3284                        }
3285                    }
3286                    LayerMatch::HintEquals(hint) => {
3287                        if *hint > 1024 {
3288                            bail!(
3289                                "Spec {:?} has hint value outside the range [0, 1024]",
3290                                spec.name
3291                            );
3292                        }
3293
3294                        if let Some((layer_id, name)) = hint_to_layer_map.get(hint) {
3295                            if *layer_id != idx {
3296                                bail!(
3297                                    "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
3298                                    spec.name,
3299                                    hint,
3300                                    name
3301                                );
3302                            }
3303                        } else {
3304                            hint_to_layer_map.insert(*hint, (idx, spec.name.clone()));
3305                        }
3306                        hint_equals_cnt += 1;
3307                    }
3308                    _ => {}
3309                }
3310            }
3311            if hint_equals_cnt > 1 {
3312                bail!("Only 1 HintEquals match permitted per AND block");
3313            }
3314            if hint_equals_cnt == 1 && ands.len() != 1 {
3315                bail!("HintEquals match cannot be in conjunction with other matches");
3316            }
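            // e.g. (illustrative) [[HintEquals(7)]] is accepted, while
            // [[HintEquals(7), CommPrefix("foo")]] is rejected above.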
3317        }
3318
3319        match spec.kind {
3320            LayerKind::Confined {
3321                cpus_range,
3322                util_range,
3323                ..
3324            }
3325            | LayerKind::Grouped {
3326                cpus_range,
3327                util_range,
3328                ..
3329            } => {
3330                if let Some((cpus_min, cpus_max)) = cpus_range {
3331                    if cpus_min > cpus_max {
3332                        bail!(
3333                            "Spec {:?} has invalid cpus_range({}, {})",
3334                            spec.name,
3335                            cpus_min,
3336                            cpus_max
3337                        );
3338                    }
3339                }
3340                if util_range.0 >= util_range.1 {
3341                    bail!(
3342                        "Spec {:?} has invalid util_range ({}, {})",
3343                        spec.name,
3344                        util_range.0,
3345                        util_range.1
3346                    );
3347                }
3348            }
3349            _ => {}
3350        }
3351    }
3352
3353    Ok(hint_to_layer_map
3354        .into_iter()
3355        .map(|(k, v)| (k, v.0))
3356        .collect())
3357}
3358
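// For example (illustrative): name_suffix("workload.slice/", 6) returns
// "slice/", and a len longer than the string returns the whole string.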
3359fn name_suffix(cgroup: &str, len: usize) -> String {
3360    let suffixlen = std::cmp::min(len, cgroup.len());
3361    let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
3362
3363    suffixrev.chars().rev().collect()
3364}
3365
3366fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
3367    let mut paths = vec![];
3368
3369    if !dir.is_dir() {
3370        bail!("path {:?} does not correspond to a directory", dir);
3371    }
3372
3373    let direntries = fs::read_dir(dir)?;
3374
3375    for entry in direntries {
3376        let path = entry?.path();
3377        if path.is_dir() {
3378            paths.append(&mut traverse_sysfs(&path)?);
3379            paths.push(path);
3380        }
3381    }
3382
3383    Ok(paths)
3384}
3385
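// Read the effective cpuset of a cgroup. The cpuset.cpus.effective file holds
// a cpulist such as "0-3,8-11" (illustrative), which Cpumask::from_cpulist
// parses.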
3386fn find_cpumask(cgroup: &str) -> Cpumask {
3387    let mut path = String::from(cgroup);
3388    path.push_str("/cpuset.cpus.effective");
3389
3390    let description = fs::read_to_string(&path).unwrap();
3391
3392    Cpumask::from_cpulist(&description).unwrap()
3393}
3394
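// For example (illustrative): expanding CgroupSuffix("workload.slice") over
// cgroups ".../a/workload.slice" and ".../b/workload.slice" yields two
// (CgroupSuffix, Cpumask) pairs, one per matching cgroup, with each suffix
// slash-terminated and truncated to its last 64 characters.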
3395fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
3396    match rule {
3397        LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3398            .into_iter()
3399            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3400            .filter(|cgroup| cgroup.ends_with(suffix))
3401            .map(|cgroup| {
3402                (
3403                    {
3404                        let mut slashterminated = cgroup.clone();
3405                        slashterminated.push('/');
3406                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3407                    },
3408                    find_cpumask(&cgroup),
3409                )
3410            })
3411            .collect()),
3412        LayerMatch::CgroupRegex(expr) => {
3413            // Compile the regex once rather than once per cgroup path.
3414            let re = Regex::new(expr).unwrap();
3415            Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3416                .into_iter()
3417                .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3418                .filter(|cgroup| re.is_match(cgroup))
3419                .map(|cgroup| {
3420                    (
3421                        // Convert the regex match into a suffix match: matching still
3422                        // has to happen on the BPF side, where regex matching isn't easy.
3423                        {
3424                            let mut slashterminated = cgroup.clone();
3425                            slashterminated.push('/');
3426                            LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3427                        },
3428                        find_cpumask(&cgroup),
3429                    )
3430                })
3431                .collect())
3432        }
3433        _ => panic!("Unimplemented template enum {:?}", rule),
3434    }
3435}
3436
3437fn main() -> Result<()> {
3438    let opts = Opts::parse();
3439
3440    if opts.version {
3441        println!(
3442            "scx_layered {}",
3443            build_id::full_version(env!("CARGO_PKG_VERSION"))
3444        );
3445        return Ok(());
3446    }
3447
3448    if opts.help_stats {
3449        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
3450        return Ok(());
3451    }
3452
3453    if opts.no_load_frac_limit {
3454        warn!("--no-load-frac-limit is deprecated and a no-op");
3455    }
3456    if opts.layer_preempt_weight_disable != 0.0 {
3457        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
3458    }
3459    if opts.layer_growth_weight_disable != 0.0 {
3460        warn!("--layer-growth-weight-disable is deprecated and a no-op");
3461    }
3462    if opts.local_llc_iteration {
3463        warn!("--local-llc-iteration is deprecated and a no-op");
3464    }
3465
3466    let llv = match opts.verbose {
3467        0 => simplelog::LevelFilter::Info,
3468        1 => simplelog::LevelFilter::Debug,
3469        _ => simplelog::LevelFilter::Trace,
3470    };
3471    let mut lcfg = simplelog::ConfigBuilder::new();
3472    lcfg.set_time_offset_to_local()
3473        .expect("Failed to set local time offset")
3474        .set_time_level(simplelog::LevelFilter::Error)
3475        .set_location_level(simplelog::LevelFilter::Off)
3476        .set_target_level(simplelog::LevelFilter::Off)
3477        .set_thread_level(simplelog::LevelFilter::Off);
3478    simplelog::TermLogger::init(
3479        llv,
3480        lcfg.build(),
3481        simplelog::TerminalMode::Stderr,
3482        simplelog::ColorChoice::Auto,
3483    )?;
3484
3485    debug!("opts={:?}", &opts);
3486
3487    let shutdown = Arc::new(AtomicBool::new(false));
3488    let shutdown_clone = shutdown.clone();
3489    ctrlc::set_handler(move || {
3490        shutdown_clone.store(true, Ordering::Relaxed);
3491    })
3492    .context("Error setting Ctrl-C handler")?;
3493
3494    if let Some(intv) = opts.monitor.or(opts.stats) {
3495        let shutdown_copy = shutdown.clone();
3496        let jh = std::thread::spawn(move || {
3497            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
3498                Ok(_) => {
3499                    debug!("stats monitor thread finished successfully")
3500                }
3501                Err(error_object) => {
3502                    warn!(
3503                        "stats monitor thread finished because of an error {}",
3504                        error_object
3505                    )
3506                }
3507            }
3508        });
3509        if opts.monitor.is_some() {
3510            let _ = jh.join();
3511            return Ok(());
3512        }
3513    }
3514
3515    if let Some(path) = &opts.example {
3516        write_example_file(path)?;
3517        return Ok(());
3518    }
3519
3520    let mut layer_config = match opts.run_example {
3521        true => EXAMPLE_CONFIG.clone(),
3522        false => LayerConfig { specs: vec![] },
3523    };
3524
3525    for (idx, input) in opts.specs.iter().enumerate() {
3526        let specs = LayerSpec::parse(input)
3527            .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
3528
3529        for spec in specs {
3530            match spec.template {
3531                Some(ref rule) => {
3532                    let matches = expand_template(&rule)?;
3533                    // In the absence of matching cgroups, have template layers
3534                    // behave as non-template layers do.
3535                    if matches.is_empty() {
3536                        layer_config.specs.push(spec);
3537                    } else {
3538                        for (mt, mask) in matches {
3539                            let mut genspec = spec.clone();
3540
3541                            genspec.cpuset = Some(mask);
3542
3543                            // Push the new "and" rule into each "or" term.
3544                            for orterm in &mut genspec.matches {
3545                                orterm.push(mt.clone());
3546                            }
3547
3548                            match &mt {
3549                                LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
3550                                _ => bail!("Template match has unexpected type"),
3551                            }
3552
3553                            // Push the generated layer into the config
3554                            layer_config.specs.push(genspec);
3555                        }
3556                    }
3557                }
3558
3559                None => {
3560                    layer_config.specs.push(spec);
3561                }
3562            }
3563        }
3564    }
3565
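    // Fill in per-layer defaults: e.g. (illustrative) a spec that leaves
    // slice_us and weight at 0 ends up with opts.slice_us and
    // DEFAULT_LAYER_WEIGHT, clamped to [MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT].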
3566    for spec in layer_config.specs.iter_mut() {
3567        let common = spec.kind.common_mut();
3568
3569        if common.slice_us == 0 {
3570            common.slice_us = opts.slice_us;
3571        }
3572
3573        if common.weight == 0 {
3574            common.weight = DEFAULT_LAYER_WEIGHT;
3575        }
3576        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
3577
3578        if common.preempt {
3579            if common.disallow_open_after_us.is_some() {
3580                warn!(
3581                    "Preempt layer {} has non-null disallow_open_after_us, ignored",
3582                    &spec.name
3583                );
3584            }
3585            if common.disallow_preempt_after_us.is_some() {
3586                warn!(
3587                    "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
3588                    &spec.name
3589                );
3590            }
3591            common.disallow_open_after_us = Some(u64::MAX);
3592            common.disallow_preempt_after_us = Some(u64::MAX);
3593        } else {
3594            if common.disallow_open_after_us.is_none() {
3595                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
3596            }
3597
3598            if common.disallow_preempt_after_us.is_none() {
3599                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
3600            }
3601        }
3602
3603        if common.idle_smt.is_some() {
3604            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
3605        }
3606    }
3607
3608    if opts.print_and_exit {
3609        println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3610        return Ok(());
3611    }
3612
3613    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3614    let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
3615
3616    let mut open_object = MaybeUninit::uninit();
3617    loop {
3618        let mut sched = Scheduler::init(
3619            &opts,
3620            &layer_config.specs,
3621            &mut open_object,
3622            &hint_to_layer_map,
3623        )?;
3624        if !sched.run(shutdown.clone())?.should_restart() {
3625            break;
3626        }
3627    }
3628
3629    Ok(())
3630}