scx_layered/main.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::ffi::CString;
11use std::fs;
12use std::io::Write;
13use std::mem::MaybeUninit;
14use std::ops::Sub;
15use std::path::Path;
16use std::path::PathBuf;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::thread::ThreadId;
21use std::time::Duration;
22use std::time::Instant;
23
24use ::fb_procfs as procfs;
25use anyhow::anyhow;
26use anyhow::bail;
27use anyhow::Context;
28use anyhow::Result;
29pub use bpf_skel::*;
30use clap::Parser;
31use crossbeam::channel::RecvTimeoutError;
32use lazy_static::lazy_static;
33use libbpf_rs::MapCore as _;
34use libbpf_rs::OpenObject;
35use libbpf_rs::ProgramInput;
36use log::debug;
37use log::info;
38use log::trace;
39use log::warn;
40use scx_layered::*;
41use scx_stats::prelude::*;
42use scx_utils::compat;
43use scx_utils::init_libbpf_logging;
44use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
45use scx_utils::read_netdevs;
46use scx_utils::scx_enums;
47use scx_utils::scx_ops_attach;
48use scx_utils::scx_ops_load;
49use scx_utils::scx_ops_open;
50use scx_utils::uei_exited;
51use scx_utils::uei_report;
52use scx_utils::CoreType;
53use scx_utils::Cpumask;
54use scx_utils::Llc;
55use scx_utils::NetDev;
56use scx_utils::Topology;
57use scx_utils::UserExitInfo;
58use scx_utils::NR_CPUS_POSSIBLE;
59use scx_utils::NR_CPU_IDS;
60use stats::LayerStats;
61use stats::StatsReq;
62use stats::StatsRes;
63use stats::SysStats;
64
65const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
66const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
67const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
68const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
69const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
70const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
71const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
72const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
73const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
74const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
75
76const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
77const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
78const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
79const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
80const LAYER_USAGE_PROTECTED_PREEMPT: usize =
81    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
82const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
83
84const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
85const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
86const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
87
88const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
89
90lazy_static! {
91    static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
92    static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
93    static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
94    static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
95        specs: vec![
96            LayerSpec {
97                name: "batch".into(),
98                comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
99                cpuset: None,
100                template: None,
101                matches: vec![
102                    vec![LayerMatch::CgroupPrefix("system.slice/".into())],
103                    vec![LayerMatch::NiceAbove(0)],
104                ],
105                kind: LayerKind::Confined {
106                    util_range: (0.8, 0.9),
107                    cpus_range: Some((0, 16)),
108                    cpus_range_frac: None,
109                    protected: false,
110                    common: LayerCommon {
111                        min_exec_us: 1000,
112                        yield_ignore: 0.0,
113                        preempt: false,
114                        preempt_first: false,
115                        exclusive: false,
116                        allow_node_aligned: false,
117                        skip_remote_node: false,
118                        prev_over_idle_core: false,
119                        idle_smt: None,
120                        slice_us: 20000,
121                        fifo: false,
122                        weight: DEFAULT_LAYER_WEIGHT,
123                        disallow_open_after_us: None,
124                        disallow_preempt_after_us: None,
125                        xllc_mig_min_us: 1000.0,
126                        growth_algo: LayerGrowthAlgo::Sticky,
127                        idle_resume_us: None,
128                        perf: 1024,
129                        nodes: vec![],
130                        llcs: vec![],
131                        placement: LayerPlacement::Standard,
132                    },
133                },
134            },
135            LayerSpec {
136                name: "immediate".into(),
137                comment: Some("tasks under workload.slice with nice value < 0".into()),
138                cpuset: None,
139                template: None,
140                matches: vec![vec![
141                    LayerMatch::CgroupPrefix("workload.slice/".into()),
142                    LayerMatch::NiceBelow(0),
143                ]],
144                kind: LayerKind::Open {
145                    common: LayerCommon {
146                        min_exec_us: 100,
147                        yield_ignore: 0.25,
148                        preempt: true,
149                        preempt_first: false,
150                        exclusive: true,
151                        allow_node_aligned: true,
152                        skip_remote_node: false,
153                        prev_over_idle_core: true,
154                        idle_smt: None,
155                        slice_us: 20000,
156                        fifo: false,
157                        weight: DEFAULT_LAYER_WEIGHT,
158                        disallow_open_after_us: None,
159                        disallow_preempt_after_us: None,
160                        xllc_mig_min_us: 0.0,
161                        growth_algo: LayerGrowthAlgo::Sticky,
162                        perf: 1024,
163                        idle_resume_us: None,
164                        nodes: vec![],
165                        llcs: vec![],
166                        placement: LayerPlacement::Standard,
167                    },
168                },
169            },
170            LayerSpec {
171                name: "stress-ng".into(),
172                comment: Some("stress-ng test layer".into()),
173                cpuset: None,
174                template: None,
175                matches: vec![
176                    vec![LayerMatch::CommPrefix("stress-ng".into()),],
177                    vec![LayerMatch::PcommPrefix("stress-ng".into()),]
178                ],
179                kind: LayerKind::Confined {
180                    cpus_range: None,
181                    util_range: (0.2, 0.8),
182                    protected: false,
183                    cpus_range_frac: None,
184                    common: LayerCommon {
185                        min_exec_us: 800,
186                        yield_ignore: 0.0,
187                        preempt: true,
188                        preempt_first: false,
189                        exclusive: false,
190                        allow_node_aligned: false,
191                        skip_remote_node: false,
192                        prev_over_idle_core: false,
193                        idle_smt: None,
194                        slice_us: 800,
195                        fifo: false,
196                        weight: DEFAULT_LAYER_WEIGHT,
197                        disallow_open_after_us: None,
198                        disallow_preempt_after_us: None,
199                        xllc_mig_min_us: 0.0,
200                        growth_algo: LayerGrowthAlgo::Topo,
201                        perf: 1024,
202                        idle_resume_us: None,
203                        nodes: vec![],
204                        llcs: vec![],
205                        placement: LayerPlacement::Standard,
206                    },
207                },
208            },
209            LayerSpec {
210                name: "normal".into(),
211                comment: Some("the rest".into()),
212                cpuset: None,
213                template: None,
214                matches: vec![vec![]],
215                kind: LayerKind::Grouped {
216                    cpus_range: None,
217                    util_range: (0.5, 0.6),
218                    util_includes_open_cputime: true,
219                    protected: false,
220                    cpus_range_frac: None,
221                    common: LayerCommon {
222                        min_exec_us: 200,
223                        yield_ignore: 0.0,
224                        preempt: false,
225                        preempt_first: false,
226                        exclusive: false,
227                        allow_node_aligned: false,
228                        skip_remote_node: false,
229                        prev_over_idle_core: false,
230                        idle_smt: None,
231                        slice_us: 20000,
232                        fifo: false,
233                        weight: DEFAULT_LAYER_WEIGHT,
234                        disallow_open_after_us: None,
235                        disallow_preempt_after_us: None,
236                        xllc_mig_min_us: 100.0,
237                        growth_algo: LayerGrowthAlgo::Linear,
238                        perf: 1024,
239                        idle_resume_us: None,
240                        nodes: vec![],
241                        llcs: vec![],
242                        placement: LayerPlacement::Standard,
243                    },
244                },
245            },
246        ],
247    };
248}
249
250/// scx_layered: A highly configurable multi-layer sched_ext scheduler
251///
252/// scx_layered allows classifying tasks into multiple layers and applying
253/// different scheduling policies to them. The configuration is specified in
254/// json and composed of two parts - matches and policies.
255///
256/// Matches
257/// =======
258///
259/// Whenever a task is forked or its attributes are changed, the task goes
260/// through a series of matches to determine the layer it belongs to. A
261/// match set is composed of OR groups of AND blocks. An example:
262///
263///   "matches": [
264///     [
265///       {
266///         "CgroupPrefix": "system.slice/"
267///       }
268///     ],
269///     [
270///       {
271///         "CommPrefix": "fbagent"
272///       },
273///       {
274///         "NiceAbove": 0
275///       }
276///     ]
277///   ],
278///
279/// The outer array contains the OR groups and the inner AND blocks, so the
280/// above matches:
281///
282/// - Tasks which are in the cgroup sub-hierarchy under "system.slice".
283///
284/// - Or tasks whose comm starts with "fbagent" and have a nice value > 0.
285///
286/// Currently, the following matches are supported:
287///
288/// - CgroupPrefix: Matches the prefix of the cgroup that the task belongs
289///   to. As this is a string match, whether the pattern has the trailing
290///   '/' makes a difference. For example, "TOP/CHILD/" only matches tasks
291///   which are under that particular cgroup while "TOP/CHILD" also matches
292///   tasks under "TOP/CHILD0/" or "TOP/CHILD1/".
293///
294/// - CommPrefix: Matches the task's comm prefix.
295///
296/// - PcommPrefix: Matches the task's thread group leader's comm prefix.
297///
298/// - NiceAbove: Matches if the task's nice value is greater than the
299///   pattern.
300///
301/// - NiceBelow: Matches if the task's nice value is smaller than the
302///   pattern.
303///
304/// - NiceEquals: Matches if the task's nice value is exactly equal to
305///   the pattern.
306///
307/// - UIDEquals: Matches if the task's effective user id matches the value.
308///
309/// - GIDEquals: Matches if the task's effective group id matches the value.
310///
311/// - PIDEquals: Matches if the task's pid matches the value.
312///
313/// - PPIDEquals: Matches if the task's ppid matches the value.
314///
315/// - TGIDEquals: Matches if the task's tgid matches the value.
316///
317/// - NSPIDEquals: Matches if the task's namespace id and pid match the values.
318///
319/// - NSEquals: Matches if the task's namespace id matches the value.
320///
321/// - IsGroupLeader: Bool. When true, matches if the task is group leader
322///   (i.e. PID == TGID), aka the thread from which other threads are made.
323///   When false, matches if the task is *not* the group leader (i.e. the rest).
324///
325/// - CmdJoin: Matches when the task uses pthread_setname_np to send a join/leave
326///   command to the scheduler. See examples/cmdjoin.c for more details.
327///
328/// - UsedGpuTid: Bool. When true, matches tasks which have used a GPU,
329///   identified by tid.
330///
331/// - UsedGpuPid: Bool. When true, matches tasks which have used a GPU,
332///   identified by tgid/pid.
333///
334/// - [EXPERIMENTAL] AvgRuntime: (u64, u64). Match tasks whose average runtime
335///   is within the provided values [min, max).
336///
337/// While there are complexity limitations as the matches are performed in
338/// BPF, it is straightforward to add more types of matches.
339///
340/// Templates
341/// ---------
342///
343/// Templates let us create a variable number of layers dynamically at initialization
344/// time out of a cgroup name suffix/prefix. Sometimes we know there are multiple
345/// applications running on a machine, each in its own cgroup, but we do not know the
346/// exact names of the applications or cgroups, e.g., in cloud computing contexts where
347/// workloads are placed on machines dynamically and run under cgroups whose names are
348/// autogenerated. In that case, we cannot hardcode the cgroup match rules when writing
349/// the configuration, and so cannot easily prevent tasks from different cgroups from
350/// falling into the same layer and affecting each other's performance.
351///
352///
353/// Templates offer a solution to this problem by generating one layer for each such cgroup,
354/// provided these cgroups share a suffix, and that the suffix is unique to them. Templates
355/// have a cgroup suffix rule that we use to find the relevant cgroups in the system. For each
356/// such cgroup, we copy the layer config and add a matching rule that matches just this cgroup.
357///
358///
359/// Policies
360/// ========
361///
362/// The following is an example policy configuration for a layer.
363///
364///   "kind": {
365///     "Confined": {
366///       "cpus_range": [1, 8],
367///       "util_range": [0.8, 0.9]
368///     }
369///   }
370///
371/// It's of "Confined" kind, which tries to concentrate the layer's tasks
372/// into a limited number of CPUs. In the above case, the number of CPUs
373/// assigned to the layer is scaled between 1 and 8 so that the per-cpu
374/// utilization is kept between 80% and 90%. If the CPUs are loaded higher
375/// than 90%, more CPUs are allocated to the layer. If the utilization drops
376/// below 80%, the layer loses CPUs.
377///
378/// Currently, the following policy kinds are supported:
379///
380/// - Confined: Tasks are restricted to the allocated CPUs. The number of
381///   CPUs allocated is modulated to keep the per-CPU utilization in
382///   "util_range". The range can optionally be restricted with the
383///   "cpus_range" property.
384///
385/// - Grouped: Similar to Confined but tasks may spill outside if there are
386///   idle CPUs outside the allocated ones. The range can optionally be
387///   restricted with the "cpus_range" property.
388///
389/// - Open: Prefer the CPUs which are not occupied by Confined or Grouped
390///   layers. Tasks in this group will spill into occupied CPUs if there are
391///   no unoccupied idle CPUs.
392///
393/// All layers take the following options:
394///
395/// - min_exec_us: Minimum execution time in microseconds. Whenever a task
396///   is scheduled in, this is the minimum CPU time that it's charged no
397///   matter how short the actual execution time may be.
398///
399/// - yield_ignore: Yield ignore ratio. If 0.0, yield(2) forfeits a whole
400///   execution slice. 0.25 yields three quarters of an execution slice and
401///   so on. If 1.0, yield is completely ignored.
402///
403/// - slice_us: Scheduling slice duration in microseconds.
404///
405/// - fifo: Use FIFO queues within the layer instead of the default vtime.
406///
407/// - preempt: If true, tasks in the layer will preempt tasks which belong
408///   to other non-preempting layers when no idle CPUs are available.
409///
410/// - preempt_first: If true, tasks in the layer will try to preempt tasks
411///   in their previous CPUs before trying to find idle CPUs.
412///
413/// - exclusive: If true, tasks in the layer will occupy the whole core. The
414///   other logical CPUs sharing the same core will be kept idle. This isn't
415///   a hard guarantee, so don't depend on it for security purposes.
416///
417/// - allow_node_aligned: Put node aligned tasks on layer DSQs instead of lo
418///   fallback. This is a hack to support node-affine tasks without making
419///   the whole scheduler node aware and should only be used with open
420///   layers on non-saturated machines to avoid possible stalls.
421///
422/// - prev_over_idle_core: On SMT enabled systems, prefer the task's previous
423///   CPU when picking a CPU for tasks on this layer, even if that CPU's SMT
424///   sibling is processing a task.
425///
426/// - weight: Weight of the layer in the range 1 to 10000, with a default
427///   of 100. Layer weights are used during contention to prevent
428///   starvation across layers. Weights are used in combination with
429///   utilization to determine the infeasible adjusted weight, with higher
430///   weights having a larger adjustment in adjusted utilization.
431///
432/// - disallow_open_after_us: Duration to wait after machine reaches saturation
433///   before confining tasks in Open layers.
434///
435/// - cpus_range_frac: Array of 2 floats between 0 and 1.0. Lower and upper
436///   bound fractions of all CPUs to give to a layer. Mutually exclusive
437///   with cpus_range.
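///   For example, "cpus_range_frac": [0.2, 0.5] allows the layer between 20%
///   and 50% of all CPUs.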
438///
439/// - disallow_preempt_after_us: Duration to wait after machine reaches saturation
440///   before disallowing tasks in the layer from preempting.
441///
442/// - xllc_mig_min_us: Skip a cross-LLC migration if the task is likely to run
443///   on its existing LLC sooner than this.
444///
445/// - idle_smt: ***DEPRECATED***
446///
447/// - growth_algo: When a layer is allocated new CPUs, different algorithms can
448///   be used to determine which CPU should be allocated next. The default
449///   algorithm is a "sticky" algorithm that attempts to spread layers evenly
450///   across cores.
451///
452/// - perf: CPU performance target. 0 means no configuration. A value
453///   between 1 and 1024 indicates the performance level that CPUs running
454///   tasks in this layer are set to via scx_bpf_cpuperf_set().
455///
456/// - idle_resume_us: Sets the idle resume QoS value. CPU idle time governors are expected to
457///   regard the minimum of the global (effective) CPU latency limit and the effective resume
458///   latency constraint for the given CPU as the upper limit for the exit latency of the idle
459///   states. See the latest kernel docs for more details:
460///   https://www.kernel.org/doc/html/latest/admin-guide/pm/cpuidle.html
461///
462/// - nodes: If set, the layer will use the given set of NUMA nodes for
463///   scheduling decisions. If unset, all available NUMA nodes will be used.
464///   If the llcs value is also set, the cpuset of the NUMA nodes will be
465///   or'ed with the LLC config.
466///
467/// - llcs: If set, the layer will use the given set of LLCs (last level
468///   caches) for scheduling decisions. If unset, all LLCs will be used. If
469///   the nodes value is also set, the cpuset of the LLCs will be or'ed with
470///   the nodes config.
471///
472///
473/// Similar to matches, adding new policies and extending existing ones
474/// should be relatively straightforward.
475///
476/// Configuration example and running scx_layered
477/// =============================================
478///
479/// An scx_layered config is composed of layer configs. A layer config is
480/// composed of a name, a set of matches, and a policy block. Running the
481/// following will write an example configuration into example.json.
482///
483///   $ scx_layered -e example.json
484///
485/// Note that the last layer in the configuration must have an empty match set
486/// as a catch-all for tasks which haven't been matched into previous layers.
487///
488/// The configuration can be specified in multiple json files and
489/// command line arguments, which are concatenated in the specified
490/// order. Each must contain valid layer configurations.
491///
492/// By default, an argument to scx_layered is interpreted as a JSON string. If
493/// the argument is a pointer to a JSON file, it should be prefixed with file:
494/// or f: as follows:
495///
496///   $ scx_layered file:example.json
497///   ...
498///   $ scx_layered f:example.json
499///
500/// Monitoring Statistics
501/// =====================
502///
503/// Run with `--stats INTERVAL` to enable stats monitoring. There is
504/// also an scx_stat server listening on /var/run/scx/root/stat that can
505/// be monitored by running `scx_layered --monitor INTERVAL` separately.
506///
507///   ```bash
508///   $ scx_layered --monitor 1
509///   tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 proc=6ms
510///   busy= 34.2 util= 1733.6 load=  21744.1 fallback_cpu=  1
511///     batch    : util/frac=   11.8/  0.7 load/frac=     29.7:  0.1 tasks=  2597
512///                tot=   3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00
513///                cpus=  2 [  2,  2] 04000001 00000000
514///     immediate: util/frac= 1218.8/ 70.3 load/frac=  21399.9: 98.4 tasks=  1107
515///                tot=  68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00
516///                cpus= 50 [ 50, 50] fbfffffe 000fffff
517///     normal   : util/frac=  502.9/ 29.0 load/frac=    314.5:  1.4 tasks=  3512
518///                tot=  45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56
519///                cpus= 50 [ 50, 50] fbfffffe 000fffff
520///   ```
521///
522/// Global statistics: see [`SysStats`]
523///
524/// Per-layer statistics: see [`LayerStats`]
525///
526#[derive(Debug, Parser)]
527#[command(verbatim_doc_comment)]
528struct Opts {
529    /// Scheduling slice duration in microseconds.
530    #[clap(short = 's', long, default_value = "20000")]
531    slice_us: u64,
532
533    /// Maximum consecutive execution time in microseconds. A task may be
534    /// allowed to keep executing on a CPU for this long. Note that this is
535    /// the upper limit and a task may have to be moved off the CPU earlier. 0
536    /// indicates default - 20 * slice_us.
537    #[clap(short = 'M', long, default_value = "0")]
538    max_exec_us: u64,
539
540    /// Scheduling interval in seconds.
541    #[clap(short = 'i', long, default_value = "0.1")]
542    interval: f64,
543
544    /// ***DEPRECATED*** Disable load-fraction based max layer CPU limit. Not
545    /// recommended.
546    #[clap(short = 'n', long, default_value = "false")]
547    no_load_frac_limit: bool,
548
549    /// Exit debug dump buffer length. 0 indicates default.
550    #[clap(long, default_value = "0")]
551    exit_dump_len: u32,
552
553    /// Enable verbose output, including libbpf details. Specify multiple
554    /// times to increase verbosity.
555    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
556    verbose: u8,
557
558    /// Disable topology awareness. When enabled, the "nodes" and "llcs" settings on
559    /// a layer are ignored. Defaults to false on topologies with multiple NUMA nodes
560    /// or LLCs, and true otherwise.
561    #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
562    disable_topology: Option<bool>,
563
564    /// Enable cross NUMA preemption.
565    #[clap(long)]
566    xnuma_preemption: bool,
567
568    /// Disable monitor
569    #[clap(long)]
570    monitor_disable: bool,
571
572    /// Write example layer specifications into the file and exit.
573    #[clap(short = 'e', long)]
574    example: Option<String>,
575
576    /// ***DEPRECATED*** Disables preemption if the weighted load fraction
577    /// of a layer (load_frac_adj) exceeds the threshold. The default is
578    /// disabled (0.0).
579    #[clap(long, default_value = "0.0")]
580    layer_preempt_weight_disable: f64,
581
582    /// ***DEPRECATED*** Disables layer growth if the weighted load fraction
583    /// of a layer (load_frac_adj) exceeds the threshold. The default is
584    /// disabled (0.0).
585    #[clap(long, default_value = "0.0")]
586    layer_growth_weight_disable: f64,
587
588    /// Enable stats monitoring with the specified interval.
589    #[clap(long)]
590    stats: Option<f64>,
591
592    /// Run in stats monitoring mode with the specified interval. Scheduler
593    /// is not launched.
594    #[clap(long)]
595    monitor: Option<f64>,
596
597    /// Run with example layer specifications (useful for e.g. CI pipelines)
598    #[clap(long)]
599    run_example: bool,
600
601    /// ***DEPRECATED*** Enables iteration over local LLCs first for
602    /// dispatch.
603    #[clap(long, default_value = "false")]
604    local_llc_iteration: bool,
605
606    /// Low priority fallback DSQs are used to execute tasks with custom CPU
607    /// affinities. These DSQs are immediately executed iff a CPU is
608    /// otherwise idle. However, after the specified wait, they are
609    /// guaranteed up to --lo-fb-share fraction of each CPU.
610    #[clap(long, default_value = "10000")]
611    lo_fb_wait_us: u64,
612
613    /// The fraction of CPU time guaranteed to low priority fallback DSQs.
614    /// See --lo-fb-wait-us.
615    #[clap(long, default_value = ".05")]
616    lo_fb_share: f64,
617
618    /// Disable antistall
619    #[clap(long, default_value = "false")]
620    disable_antistall: bool,
621
622    /// Enable match debug
623    /// This stores a mapping of task tid
624    /// to layer id such that bpftool map dump
625    /// can be used to debug layer matches.
626    #[clap(long, default_value = "false")]
627    enable_match_debug: bool,
628
629    /// Maximum task runnable_at delay (in seconds) before antistall turns on
630    #[clap(long, default_value = "3")]
631    antistall_sec: u64,
632
633    /// Enable gpu support
634    #[clap(long, default_value = "false")]
635    enable_gpu_support: bool,
636
637    /// Gpu Kprobe Level
638    /// The value set here determines how aggressive
639    /// the kprobes enabled on gpu driver functions are.
640    /// Higher values are more aggressive, incurring more system overhead
641    /// while identifying PIDs using GPUs more accurately and in a more timely manner.
642    /// Lower values incur less system overhead, at the cost of less accurately
643    /// identifying GPU pids and taking longer to do so.
644    #[clap(long, default_value = "3")]
645    gpu_kprobe_level: u64,
646
647    /// Enable netdev IRQ balancing. This is experimental and should be used with caution.
648    #[clap(long, default_value = "false")]
649    netdev_irq_balance: bool,
650
651    /// Disable queued wakeup optimization.
652    #[clap(long, default_value = "false")]
653    disable_queued_wakeup: bool,
654
655    /// Per-cpu kthreads are preempting by default. Make it not so.
656    #[clap(long, default_value = "false")]
657    disable_percpu_kthread_preempt: bool,
658
659    /// Only highpri (nice < 0) per-cpu kthreads are preempting by default.
660    /// Make every per-cpu kthread preempting. Meaningful only if
661    /// --disable-percpu-kthread-preempt is not set.
662    #[clap(long, default_value = "false")]
663    percpu_kthread_preempt_all: bool,
664
665    /// Show descriptions for statistics.
666    #[clap(long)]
667    help_stats: bool,
668
669    /// Layer specification. See --help.
670    specs: Vec<String>,
671
672    /// Periodically force tasks in layers using the AvgRuntime match rule to reevaluate which layer they belong to.
673    /// Default period of 2s. A value of 0 turns this off.
674    #[clap(long, default_value = "2000")]
675    layer_refresh_ms_avgruntime: u64,
676
677    /// Set the path for pinning the task hint map.
678    #[clap(long, default_value = "")]
679    task_hint_map: String,
680
681    /// Print the config (after template expansion) and exit.
682    #[clap(long, default_value = "false")]
683    print_and_exit: bool,
684}
685
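/// Reads the aggregated (all-CPU) stat line from /proc/stat via fb_procfs.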
686fn read_total_cpu(reader: &procfs::ProcReader) -> Result<procfs::CpuStat> {
687    reader
688        .read_stat()
689        .context("Failed to read procfs")?
690        .total_cpu
691        .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
692}
693
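/// Computes the busy fraction of CPU time between two /proc/stat snapshots:
/// busy / (busy + idle + iowait), clamped to [0.0, 1.0]. iowait is counted as
/// not busy. Returns 1.0 if no time has elapsed between the snapshots.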
694fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
695    match (curr, prev) {
696        (
697            procfs::CpuStat {
698                user_usec: Some(curr_user),
699                nice_usec: Some(curr_nice),
700                system_usec: Some(curr_system),
701                idle_usec: Some(curr_idle),
702                iowait_usec: Some(curr_iowait),
703                irq_usec: Some(curr_irq),
704                softirq_usec: Some(curr_softirq),
705                stolen_usec: Some(curr_stolen),
706                ..
707            },
708            procfs::CpuStat {
709                user_usec: Some(prev_user),
710                nice_usec: Some(prev_nice),
711                system_usec: Some(prev_system),
712                idle_usec: Some(prev_idle),
713                iowait_usec: Some(prev_iowait),
714                irq_usec: Some(prev_irq),
715                softirq_usec: Some(prev_softirq),
716                stolen_usec: Some(prev_stolen),
717                ..
718            },
719        ) => {
720            let idle_usec = curr_idle.saturating_sub(*prev_idle);
721            let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
722            let user_usec = curr_user.saturating_sub(*prev_user);
723            let system_usec = curr_system.saturating_sub(*prev_system);
724            let nice_usec = curr_nice.saturating_sub(*prev_nice);
725            let irq_usec = curr_irq.saturating_sub(*prev_irq);
726            let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
727            let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
728
729            let busy_usec =
730                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
731            let total_usec = idle_usec + busy_usec + iowait_usec;
732            if total_usec > 0 {
733                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
734            } else {
735                Ok(1.0)
736            }
737        }
738        _ => bail!("Missing stats in cpustat"),
739    }
740}
741
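/// Copies `src` into the fixed-size i8 buffer `dst` as a NUL-terminated C
/// string. Panics if `src` contains an interior NUL or does not fit in `dst`.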
742fn copy_into_cstr(dst: &mut [i8], src: &str) {
743    let cstr = CString::new(src).unwrap();
744    let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
745    dst[0..bytes.len()].copy_from_slice(bytes);
746}
747
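/// Builds a bitmask with one bit set for each listed NUMA node id.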
748fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
749    let mut mask = 0;
750    for node in nodes {
751        mask |= 1 << node;
752    }
753    mask
754}
755
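/// Builds a bitmask with one bit set for each LLC id in the map.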
756fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
757    let mut mask = 0;
758    for (_, cache) in llcs {
759        mask |= 1 << cache.id;
760    }
761    mask
762}
763
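/// Reads the per-CPU cpu_ctx structs for every possible CPU from the percpu
/// cpu_ctxs BPF map.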
764fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
765    let mut cpu_ctxs = vec![];
766    let cpu_ctxs_vec = skel
767        .maps
768        .cpu_ctxs
769        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
770        .context("Failed to lookup cpu_ctx")?
771        .unwrap();
772    for cpu in 0..*NR_CPUS_POSSIBLE {
773        cpu_ctxs.push(*unsafe {
774            &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
775        });
776    }
777    Ok(cpu_ctxs)
778}
779
780#[derive(Clone, Debug)]
781struct BpfStats {
782    gstats: Vec<u64>,
783    lstats: Vec<Vec<u64>>,
784    lstats_sums: Vec<u64>,
785    llc_lstats: Vec<Vec<Vec<u64>>>, // [layer][llc][stat]
786}
787
788impl BpfStats {
789    fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
790        let nr_layers = skel.maps.rodata_data.nr_layers as usize;
791        let nr_llcs = skel.maps.rodata_data.nr_llcs as usize;
792        let mut gstats = vec![0u64; NR_GSTATS];
793        let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
794        let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
795
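        // Sum per-CPU counters into global and per-layer totals.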
796        for cpu in 0..*NR_CPUS_POSSIBLE {
797            for stat in 0..NR_GSTATS {
798                gstats[stat] += cpu_ctxs[cpu].gstats[stat];
799            }
800            for layer in 0..nr_layers {
801                for stat in 0..NR_LSTATS {
802                    lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
803                }
804            }
805        }
806
807        let mut lstats_sums = vec![0u64; NR_LSTATS];
808        for layer in 0..nr_layers {
809            for stat in 0..NR_LSTATS {
810                lstats_sums[stat] += lstats[layer][stat];
811            }
812        }
813
814        for llc_id in 0..nr_llcs {
815            // XXX - This would be a lot easier if llc_ctx were in
816            // the bss. Unfortunately, kernel < v6.12 crashes and
817            // kernel >= v6.12 fails verification after such
818            // conversion due to seemingly verifier bugs. Convert to
819            // bss maps later.
820            let key = llc_id as u32;
821            let llc_id_slice =
822                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
823            let v = skel
824                .maps
825                .llc_data
826                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
827                .unwrap()
828                .unwrap();
829            let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
830
831            for layer_id in 0..nr_layers {
832                for stat_id in 0..NR_LLC_LSTATS {
833                    llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
834                }
835            }
836        }
837
838        Self {
839            gstats,
840            lstats,
841            lstats_sums,
842            llc_lstats,
843        }
844    }
845}
846
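// Element-wise subtraction of two BpfStats snapshots, yielding per-interval
// deltas. The LLC latency stat is not subtractable; the left-hand value is
// kept as-is.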
847impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
848    type Output = BpfStats;
849
850    fn sub(self, rhs: &'b BpfStats) -> BpfStats {
851        let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
852        BpfStats {
853            gstats: vec_sub(&self.gstats, &rhs.gstats),
854            lstats: self
855                .lstats
856                .iter()
857                .zip(rhs.lstats.iter())
858                .map(|(l, r)| vec_sub(l, r))
859                .collect(),
860            lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
861            llc_lstats: self
862                .llc_lstats
863                .iter()
864                .zip(rhs.llc_lstats.iter())
865                .map(|(l_layer, r_layer)| {
866                    l_layer
867                        .iter()
868                        .zip(r_layer.iter())
869                        .map(|(l_llc, r_llc)| {
870                            let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
871                            // Lat is not subtractable, take L side.
872                            r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
873                            vec_sub(&l_llc, &r_llc)
874                        })
875                        .collect()
876                })
877                .collect(),
878        }
879    }
880}
881
882#[derive(Clone, Debug)]
883struct Stats {
884    at: Instant,
885    elapsed: Duration,
886    nr_layers: usize,
887    nr_layer_tasks: Vec<usize>,
888    nr_nodes: usize,
889
890    total_util: f64, // Running AVG of sum of layer_utils
891    layer_utils: Vec<Vec<f64>>,
892    prev_layer_usages: Vec<Vec<u64>>,
893
894    cpu_busy: f64, // Read from /proc, maybe higher than total_util
895    prev_total_cpu: procfs::CpuStat,
896
897    bpf_stats: BpfStats,
898    prev_bpf_stats: BpfStats,
899
900    processing_dur: Duration,
901    prev_processing_dur: Duration,
902
903    layer_slice_us: Vec<u64>,
904}
905
906impl Stats {
907    fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
908        let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
909
910        for cpu in 0..*NR_CPUS_POSSIBLE {
911            for layer in 0..nr_layers {
912                for usage in 0..NR_LAYER_USAGES {
913                    layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
914                }
915            }
916        }
917
918        layer_usages
919    }
920
921    fn new(skel: &mut BpfSkel, proc_reader: &procfs::ProcReader) -> Result<Self> {
922        let nr_layers = skel.maps.rodata_data.nr_layers as usize;
923        let cpu_ctxs = read_cpu_ctxs(skel)?;
924        let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
925        let nr_nodes = skel.maps.rodata_data.nr_nodes as usize;
926
927        Ok(Self {
928            at: Instant::now(),
929            elapsed: Default::default(),
930            nr_layers,
931            nr_layer_tasks: vec![0; nr_layers],
932            nr_nodes,
933
934            total_util: 0.0,
935            layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
936            prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
937
938            cpu_busy: 0.0,
939            prev_total_cpu: read_total_cpu(proc_reader)?,
940
941            bpf_stats: bpf_stats.clone(),
942            prev_bpf_stats: bpf_stats,
943
944            processing_dur: Default::default(),
945            prev_processing_dur: Default::default(),
946
947            layer_slice_us: vec![0; nr_layers],
948        })
949    }
950
951    fn refresh(
952        &mut self,
953        skel: &mut BpfSkel,
954        proc_reader: &procfs::ProcReader,
955        now: Instant,
956        cur_processing_dur: Duration,
957    ) -> Result<()> {
958        let elapsed = now.duration_since(self.at);
959        let elapsed_f64 = elapsed.as_secs_f64();
960        let cpu_ctxs = read_cpu_ctxs(skel)?;
961
962        let nr_layer_tasks: Vec<usize> = skel
963            .maps
964            .bss_data
965            .layers
966            .iter()
967            .take(self.nr_layers)
968            .map(|layer| layer.nr_tasks as usize)
969            .collect();
970        let layer_slice_us: Vec<u64> = skel
971            .maps
972            .bss_data
973            .layers
974            .iter()
975            .take(self.nr_layers)
976            .map(|layer| layer.slice_ns / 1000_u64)
977            .collect();
978
979        let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
980        let cur_layer_utils: Vec<Vec<f64>> = cur_layer_usages
981            .iter()
982            .zip(self.prev_layer_usages.iter())
983            .map(|(cur, prev)| {
984                cur.iter()
985                    .zip(prev.iter())
986                    .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
987                    .collect()
988            })
989            .collect();
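        // Fold the interval utilization into a running average with exponential
        // decay: decay = 0.5^(elapsed / USAGE_HALF_LIFE),
        // avg = prev * decay + cur * (1 - decay).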
990        let layer_utils: Vec<Vec<f64>> = cur_layer_utils
991            .iter()
992            .zip(self.layer_utils.iter())
993            .map(|(cur, prev)| {
994                cur.iter()
995                    .zip(prev.iter())
996                    .map(|(c, p)| {
997                        let decay = USAGE_DECAY.powf(elapsed_f64);
998                        p * decay + c * (1.0 - decay)
999                    })
1000                    .collect()
1001            })
1002            .collect();
1003
1004        let cur_total_cpu = read_total_cpu(proc_reader)?;
1005        let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1006
1007        let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1008        let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1009
1010        let processing_dur = cur_processing_dur
1011            .checked_sub(self.prev_processing_dur)
1012            .unwrap();
1013
1014        *self = Self {
1015            at: now,
1016            elapsed,
1017            nr_layers: self.nr_layers,
1018            nr_layer_tasks,
1019            nr_nodes: self.nr_nodes,
1020
1021            total_util: layer_utils
1022                .iter()
1023                .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1024                .sum(),
1025            layer_utils,
1026            prev_layer_usages: cur_layer_usages,
1027
1028            cpu_busy,
1029            prev_total_cpu: cur_total_cpu,
1030
1031            bpf_stats,
1032            prev_bpf_stats: cur_bpf_stats,
1033
1034            processing_dur,
1035            prev_processing_dur: cur_processing_dur,
1036
1037            layer_slice_us,
1038        };
1039        Ok(())
1040    }
1041}
1042
1043#[derive(Debug)]
1044struct Layer {
1045    name: String,
1046    kind: LayerKind,
1047    growth_algo: LayerGrowthAlgo,
1048    core_order: Vec<usize>,
1049
1050    target_llc_cpus: (usize, usize),
1051    assigned_llcs: Vec<usize>,
1052
1053    nr_cpus: usize,
1054    nr_llc_cpus: Vec<usize>,
1055    cpus: Cpumask,
1056    allowed_cpus: Cpumask,
1057}
1058
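/// Resolves a kernel symbol to its address by scanning /proc/kallsyms for the
/// first line containing the symbol name and parsing the leading hex field.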
1059fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1060    fs::read_to_string("/proc/kallsyms")?
1061        .lines()
1062        .find(|line| line.contains(sym_name))
1063        .and_then(|line| line.split_whitespace().next())
1064        .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1065        .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1066}
1067
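/// Resolves cpus_range / cpus_range_frac into a concrete (min, max) CPU count.
/// The two options are mutually exclusive; when neither is set, (0, max_cpus)
/// is used. For example, with max_cpus == 16, a cpus_range_frac of (0.25, 0.5)
/// resolves to (4, 8).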
1068fn resolve_cpus_pct_range(
1069    cpus_range: &Option<(usize, usize)>,
1070    cpus_range_frac: &Option<(f64, f64)>,
1071    max_cpus: usize,
1072) -> Result<(usize, usize)> {
1073    match (cpus_range, cpus_range_frac) {
1074        (Some(_x), Some(_y)) => {
1075            bail!("cpus_range cannot be used with cpus_pct.");
1076        }
1077        (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1078        (None, Some((cpus_frac_min, cpus_frac_max))) => {
1079            if *cpus_frac_min < 0_f64
1080                || *cpus_frac_min > 1_f64
1081                || *cpus_frac_max < 0_f64
1082                || *cpus_frac_max > 1_f64
1083            {
1084                bail!("cpus_range_frac values must be between 0.0 and 1.0");
1085            }
1086            let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1087            let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1088            Ok((
1089                std::cmp::max(cpus_min_count, 1),
1090                std::cmp::min(cpus_max_count, max_cpus),
1091            ))
1092        }
1093        (None, None) => Ok((0, max_cpus)),
1094    }
1095}
1096
1097impl Layer {
1098    fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1099        let name = &spec.name;
1100        let kind = spec.kind.clone();
1101        let mut allowed_cpus = Cpumask::new();
1102        match &kind {
1103            LayerKind::Confined {
1104                cpus_range,
1105                cpus_range_frac,
1106                common: LayerCommon { nodes, llcs, .. },
1107                ..
1108            } => {
1109                let cpus_range =
1110                    resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1111                if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1112                    bail!("invalid cpus_range {:?}", cpus_range);
1113                }
1114                if nodes.is_empty() && llcs.is_empty() {
1115                    allowed_cpus.set_all();
1116                } else {
1117                    // build up the cpus bitset
1118                    for (node_id, node) in &topo.nodes {
1119                        // first do the matching for nodes
1120                        if nodes.contains(node_id) {
1121                            for &id in node.all_cpus.keys() {
1122                                allowed_cpus.set_cpu(id)?;
1123                            }
1124                        }
1125                        // next match on any LLCs
1126                        for (llc_id, llc) in &node.llcs {
1127                            if llcs.contains(llc_id) {
1128                                for &id in llc.all_cpus.keys() {
1129                                    allowed_cpus.set_cpu(id)?;
1130                                }
1131                            }
1132                        }
1133                    }
1134                }
1135            }
1136            LayerKind::Grouped {
1137                common: LayerCommon { nodes, llcs, .. },
1138                ..
1139            }
1140            | LayerKind::Open {
1141                common: LayerCommon { nodes, llcs, .. },
1142                ..
1143            } => {
1144                if nodes.is_empty() && llcs.is_empty() {
1145                    allowed_cpus.set_all();
1146                } else {
1147                    // build up the cpus bitset
1148                    for (node_id, node) in &topo.nodes {
1149                        // first do the matching for nodes
1150                        if nodes.contains(node_id) {
1151                            for &id in node.all_cpus.keys() {
1152                                allowed_cpus.set_cpu(id)?;
1153                            }
1154                        }
1155                        // next match on any LLCs
1156                        for (llc_id, llc) in &node.llcs {
1157                            if llcs.contains(llc_id) {
1158                                for &id in llc.all_cpus.keys() {
1159                                    allowed_cpus.set_cpu(id)?;
1160                                }
1161                            }
1162                        }
1163                    }
1164                }
1165            }
1166        }
1167
1168        // Util can be above 1.0 for grouped layers if
1169        // util_includes_open_cputime is set.
1170        if let Some(util_range) = kind.util_range() {
1171            if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1172                bail!("invalid util_range {:?}", util_range);
1173            }
1174        }
1175
1176        let layer_growth_algo = kind.common().growth_algo.clone();
1177
1178        debug!(
1179            "layer: {} algo: {:?} core order: {:?}",
1180            name, &layer_growth_algo, core_order
1181        );
1182
1183        Ok(Self {
1184            name: name.into(),
1185            kind,
1186            growth_algo: layer_growth_algo,
1187            core_order: core_order.clone(),
1188
1189            target_llc_cpus: (0, 0),
1190            assigned_llcs: vec![],
1191
1192            nr_cpus: 0,
1193            nr_llc_cpus: vec![0; topo.all_llcs.len()],
1194            cpus: Cpumask::new(),
1195            allowed_cpus,
1196        })
1197    }
1198
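    /// Returns CPUs from this layer back to the shared pool. The pool picks the
    /// next chunk to free (walking core_order in reverse); if that chunk is
    /// larger than max_to_free, nothing is freed and 0 is returned.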
1199    fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1200        let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1201            Some(ret) => ret.clone(),
1202            None => return Ok(0),
1203        };
1204
1205        let nr_to_free = cpus_to_free.weight();
1206
1207        Ok(if nr_to_free <= max_to_free {
1208            trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1209            self.cpus &= &cpus_to_free.not();
1210            self.nr_cpus -= nr_to_free;
1211            for cpu in cpus_to_free.iter() {
1212                self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1213            }
1214            cpu_pool.free(&cpus_to_free)?;
1215            nr_to_free
1216        } else {
1217            0
1218        })
1219    }
1220
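    /// Allocates additional CPUs for this layer from the shared pool, restricted
    /// to allowed_cpus and following the layer's core growth order. Returns the
    /// number of CPUs added, or 0 if none are available.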
1221    fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1222        let new_cpus = match cpu_pool
1223            .alloc_cpus(&self.allowed_cpus, &self.core_order)
1224            .clone()
1225        {
1226            Some(ret) => ret.clone(),
1227            None => {
1228                trace!("layer-{} can't grow, no CPUs", &self.name);
1229                return Ok(0);
1230            }
1231        };
1232
1233        let nr_new_cpus = new_cpus.weight();
1234
1235        trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1236        self.cpus |= &new_cpus;
1237        self.nr_cpus += nr_new_cpus;
1238        for cpu in new_cpus.iter() {
1239            self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1240        }
1241        Ok(nr_new_cpus)
1242    }
1243}
1244
1245struct Scheduler<'a> {
1246    skel: BpfSkel<'a>,
1247    struct_ops: Option<libbpf_rs::Link>,
1248    layer_specs: Vec<LayerSpec>,
1249
1250    sched_intv: Duration,
1251    layer_refresh_intv: Duration,
1252
1253    cpu_pool: CpuPool,
1254    layers: Vec<Layer>,
1255    idle_qos_enabled: bool,
1256
1257    proc_reader: procfs::ProcReader,
1258    sched_stats: Stats,
1259
1260    nr_layer_cpus_ranges: Vec<(usize, usize)>,
1261    processing_dur: Duration,
1262
1263    topo: Arc<Topology>,
1264    netdevs: BTreeMap<String, NetDev>,
1265    stats_server: StatsServer<StatsReq, StatsRes>,
1266}
1267
1268impl<'a> Scheduler<'a> {
1269    fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1270        skel.maps.rodata_data.nr_layers = specs.len() as u32;
1271        let mut perf_set = false;
1272
1273        let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1274        let mut layer_weights: Vec<usize> = vec![];
1275
1276        for (spec_i, spec) in specs.iter().enumerate() {
1277            let layer = &mut skel.maps.bss_data.layers[spec_i];
1278
1279            for (or_i, or) in spec.matches.iter().enumerate() {
1280                for (and_i, and) in or.iter().enumerate() {
1281                    let mt = &mut layer.matches[or_i].matches[and_i];
1282
1283                    // Rules are allowlist-based by default
1284                    mt.exclude.write(false);
1285
1286                    match and {
1287                        LayerMatch::CgroupPrefix(prefix) => {
1288                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1289                            copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1290                        }
1291                        LayerMatch::CgroupSuffix(suffix) => {
1292                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1293                            copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1294                        }
1295                        LayerMatch::CgroupContains(substr) => {
1296                            mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1297                            copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1298                        }
1299                        LayerMatch::CommPrefix(prefix) => {
1300                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1301                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1302                        }
1303                        LayerMatch::CommPrefixExclude(prefix) => {
1304                            mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1305                            mt.exclude.write(true);
1306                            copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1307                        }
1308                        LayerMatch::PcommPrefix(prefix) => {
1309                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1310                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1311                        }
1312                        LayerMatch::PcommPrefixExclude(prefix) => {
1313                            mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1314                            mt.exclude.write(true);
1315                            copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1316                        }
1317                        LayerMatch::NiceAbove(nice) => {
1318                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1319                            mt.nice = *nice;
1320                        }
1321                        LayerMatch::NiceBelow(nice) => {
1322                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1323                            mt.nice = *nice;
1324                        }
1325                        LayerMatch::NiceEquals(nice) => {
1326                            mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1327                            mt.nice = *nice;
1328                        }
1329                        LayerMatch::UIDEquals(user_id) => {
1330                            mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1331                            mt.user_id = *user_id;
1332                        }
1333                        LayerMatch::GIDEquals(group_id) => {
1334                            mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1335                            mt.group_id = *group_id;
1336                        }
1337                        LayerMatch::PIDEquals(pid) => {
1338                            mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1339                            mt.pid = *pid;
1340                        }
1341                        LayerMatch::PPIDEquals(ppid) => {
1342                            mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1343                            mt.ppid = *ppid;
1344                        }
1345                        LayerMatch::TGIDEquals(tgid) => {
1346                            mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1347                            mt.tgid = *tgid;
1348                        }
1349                        LayerMatch::NSPIDEquals(nsid, pid) => {
1350                            mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1351                            mt.nsid = *nsid;
1352                            mt.pid = *pid;
1353                        }
1354                        LayerMatch::NSEquals(nsid) => {
1355                            mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1356                            mt.nsid = *nsid as u64;
1357                        }
1358                        LayerMatch::CmdJoin(joincmd) => {
1359                            mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1360                            copy_into_cstr(&mut mt.comm_prefix, joincmd);
1361                        }
1362                        LayerMatch::IsGroupLeader(polarity) => {
1363                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1364                            mt.is_group_leader.write(*polarity);
1365                        }
1366                        LayerMatch::IsKthread(polarity) => {
1367                            mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1368                            mt.is_kthread.write(*polarity);
1369                        }
1370                        LayerMatch::UsedGpuTid(polarity) => {
1371                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1372                            mt.used_gpu_tid.write(*polarity);
1373                        }
1374                        LayerMatch::UsedGpuPid(polarity) => {
1375                            mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1376                            mt.used_gpu_pid.write(*polarity);
1377                        }
1378                        LayerMatch::AvgRuntime(min, max) => {
1379                            mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1380                            mt.min_avg_runtime_us = *min;
1381                            mt.max_avg_runtime_us = *max;
1382                        }
1383                    }
1384                }
1385                layer.matches[or_i].nr_match_ands = or.len() as i32;
1386            }
1387
1388            layer.nr_match_ors = spec.matches.len() as u32;
1389            layer.kind = spec.kind.as_bpf_enum();
1390
1391            {
1392                let LayerCommon {
1393                    min_exec_us,
1394                    yield_ignore,
1395                    perf,
1396                    preempt,
1397                    preempt_first,
1398                    exclusive,
1399                    allow_node_aligned,
1400                    skip_remote_node,
1401                    prev_over_idle_core,
1402                    growth_algo,
1403                    nodes,
1404                    slice_us,
1405                    fifo,
1406                    weight,
1407                    disallow_open_after_us,
1408                    disallow_preempt_after_us,
1409                    xllc_mig_min_us,
1410                    placement,
1411                    ..
1412                } = spec.kind.common();
1413
1414                layer.slice_ns = *slice_us * 1000;
1415                layer.fifo.write(*fifo);
1416                layer.min_exec_ns = min_exec_us * 1000;
1417                layer.yield_step_ns = if *yield_ignore > 0.999 {
1418                    0
1419                } else if *yield_ignore < 0.001 {
1420                    layer.slice_ns
1421                } else {
1422                    (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1423                };
1424                let mut layer_name: String = spec.name.clone();
1425                layer_name.truncate(MAX_LAYER_NAME);
1426                copy_into_cstr(&mut layer.name, layer_name.as_str());
1427                layer.preempt.write(*preempt);
1428                layer.preempt_first.write(*preempt_first);
1429                layer.excl.write(*exclusive);
1430                layer.allow_node_aligned.write(*allow_node_aligned);
1431                layer.skip_remote_node.write(*skip_remote_node);
1432                layer.prev_over_idle_core.write(*prev_over_idle_core);
1433                layer.growth_algo = growth_algo.as_bpf_enum();
1434                layer.weight = *weight;
1435                layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1436                    v if v == u64::MAX => v,
1437                    v => v * 1000,
1438                };
1439                layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1440                    v if v == u64::MAX => v,
1441                    v => v * 1000,
1442                };
1443                layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1444                layer_weights.push(layer.weight.try_into().unwrap());
1445                layer.perf = u32::try_from(*perf)?;
1446                layer.node_mask = nodemask_from_nodes(nodes) as u64;
1447                for (topo_node_id, topo_node) in &topo.nodes {
1448                    if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1449                        continue;
1450                    }
1451                    layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1452                }
1453
1454                let task_place = |place: u32| crate::types::layer_task_place(place);
1455                layer.task_place = match placement {
1456                    LayerPlacement::Standard => {
1457                        task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1458                    }
1459                    LayerPlacement::Sticky => {
1460                        task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1461                    }
1462                    LayerPlacement::Floating => {
1463                        task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1464                    }
1465                };
1466            }
1467
1468            layer.is_protected.write(match spec.kind {
1469                LayerKind::Open { .. } => false,
1470                LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1471                    protected
1472                }
1473            });
1474
1475            match &spec.cpuset {
1476                Some(mask) => {
1477                    Self::update_cpumask(&mask, &mut layer.cpuset);
1478                }
1479                None => {
1480                    for i in 0..layer.cpuset.len() {
1481                        layer.cpuset[i] = u8::MAX;
1482                    }
1483                }
1484            };
1485
1486            perf_set |= layer.perf > 0;
1487        }
1488
1489        layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1490        for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1491            skel.maps.rodata_data.layer_iteration_order[idx] = *layer_idx as u32;
1492        }
1493
1494        if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1495            warn!("cpufreq support not available, ignoring perf configurations");
1496        }
1497
1498        Ok(())
1499    }
1500
1501    fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1502        skel.maps.rodata_data.nr_nodes = topo.nodes.len() as u32;
1503        skel.maps.rodata_data.nr_llcs = 0;
1504
1505        for (&node_id, node) in &topo.nodes {
1506            debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1507            skel.maps.rodata_data.nr_llcs += node.llcs.len() as u32;
1508            let raw_numa_slice = node.span.as_raw_slice();
1509            let node_cpumask_slice = &mut skel.maps.rodata_data.numa_cpumasks[node_id];
1510            let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1511            left.clone_from_slice(raw_numa_slice);
1512            debug!(
1513                "node {} mask: {:?}",
1514                node_id, skel.maps.rodata_data.numa_cpumasks[node_id]
1515            );
1516
1517            for llc in node.llcs.values() {
1518                debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1519                skel.maps.rodata_data.llc_numa_id_map[llc.id] = node_id as u32;
1520            }
1521        }
1522
1523        for cpu in topo.all_cpus.values() {
1524            skel.maps.rodata_data.cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1525        }
1526    }
1527
1528    fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1529        let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1530            vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1531            vec
1532        };
1533        let radiate_cpu =
1534            |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1535                vec.sort_by_key(|&id| {
1536                    (
1537                        (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1538                        (center_cpu as i32 - id as i32).abs(),
1539                    )
1540                });
1541                vec
1542            };
1543
1544        for (&cpu_id, cpu) in &topo.all_cpus {
1545            // Collect the spans.
1546            let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1547            let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1548            let node_span = &topo.nodes[&cpu.node_id].span;
1549            let sys_span = &topo.span;
1550
1551            // Make the spans exclusive.
1552            let sys_span = sys_span.and(&node_span.not());
1553            let node_span = node_span.and(&llc_span.not());
1554            let llc_span = llc_span.and(&core_span.not());
1555            core_span.clear_cpu(cpu_id).unwrap();
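                // At this point each span holds only the CPUs unique to that
                // level: the core span excludes this CPU, the LLC span excludes
                // this core's CPUs, and so on outward.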
1556
1557            // Convert them into arrays.
1558            let mut sys_order: Vec<usize> = sys_span.iter().collect();
1559            let mut node_order: Vec<usize> = node_span.iter().collect();
1560            let mut llc_order: Vec<usize> = llc_span.iter().collect();
1561            let mut core_order: Vec<usize> = core_span.iter().collect();
1562
1563            // Reorder them so that different CPUs follow different orders.
1564            // Each CPU radiates out in both directions from its own id and
1565            // prefers the closest cores first based on core ids.
1566
1567            sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1568            node_order = radiate(node_order, cpu.node_id);
1569            llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1570            core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1571
1572            // Concatenate and record the topology boundaries.
1573            let mut order: Vec<usize> = vec![];
1574            let mut idx: usize = 0;
1575
1576            idx += 1;
1577            order.push(cpu_id);
1578
1579            idx += core_order.len();
1580            order.append(&mut core_order);
1581            let core_end = idx;
1582
1583            idx += llc_order.len();
1584            order.append(&mut llc_order);
1585            let llc_end = idx;
1586
1587            idx += node_order.len();
1588            order.append(&mut node_order);
1589            let node_end = idx;
1590
1591            idx += sys_order.len();
1592            order.append(&mut sys_order);
1593            let sys_end = idx;
1594
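                // The final order is [self, SMT siblings, rest of LLC, rest of
                // node, rest of system], with each topology boundary index
                // recorded alongside it.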
1595            debug!(
1596                "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1597                cpu_id, core_end, llc_end, node_end, sys_end, &order
1598            );
1599
1600            // Record in cpu_ctx.
1601            let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1602            for (i, &cpu) in order.iter().enumerate() {
1603                pmap.cpus[i] = cpu as u16;
1604            }
1605            pmap.core_end = core_end as u32;
1606            pmap.llc_end = llc_end as u32;
1607            pmap.node_end = node_end as u32;
1608            pmap.sys_end = sys_end as u32;
1609        }
1610    }
1611
1612    fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
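            // View each cpu_ctx as raw bytes so the structs can be written back
            // through the per-CPU map update API.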
1613        cpu_ctxs
1614            .into_iter()
1615            .map(|cpu_ctx| {
1616                let bytes = unsafe {
1617                    std::slice::from_raw_parts(
1618                        &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1619                        std::mem::size_of::<bpf_intf::cpu_ctx>(),
1620                    )
1621                };
1622                bytes.to_vec()
1623            })
1624            .collect()
1625    }
1626
1627    fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1628        let key = (0_u32).to_ne_bytes();
1629        let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1630        let cpu_ctxs_vec = skel
1631            .maps
1632            .cpu_ctxs
1633            .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1634            .context("Failed to lookup cpu_ctx")?
1635            .unwrap();
1636
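            // Partition layer indices by kind and preemption: "op" = open +
            // preempt, "on" = open non-preempt, "gp" = grouped + preempt, and
            // "gn" = grouped non-preempt.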
1637        let op_layers: Vec<u32> = layer_specs
1638            .iter()
1639            .enumerate()
1640            .filter(|(_idx, spec)| match &spec.kind {
1641                LayerKind::Open { .. } => spec.kind.common().preempt,
1642                _ => false,
1643            })
1644            .map(|(idx, _)| idx as u32)
1645            .collect();
1646        let on_layers: Vec<u32> = layer_specs
1647            .iter()
1648            .enumerate()
1649            .filter(|(_idx, spec)| match &spec.kind {
1650                LayerKind::Open { .. } => !spec.kind.common().preempt,
1651                _ => false,
1652            })
1653            .map(|(idx, _)| idx as u32)
1654            .collect();
1655        let gp_layers: Vec<u32> = layer_specs
1656            .iter()
1657            .enumerate()
1658            .filter(|(_idx, spec)| match &spec.kind {
1659                LayerKind::Grouped { .. } => spec.kind.common().preempt,
1660                _ => false,
1661            })
1662            .map(|(idx, _)| idx as u32)
1663            .collect();
1664        let gn_layers: Vec<u32> = layer_specs
1665            .iter()
1666            .enumerate()
1667            .filter(|(_idx, spec)| match &spec.kind {
1668                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1669                _ => false,
1670            })
1671            .map(|(idx, _)| idx as u32)
1672            .collect();
1673
1674        // FIXME - this incorrectly assumes all possible CPUs are consecutive.
1675        for cpu in 0..*NR_CPUS_POSSIBLE {
1676            cpu_ctxs.push(*unsafe {
1677                &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
1678            });
1679
1680            let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
1681            let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
1682            cpu_ctxs[cpu].cpu = cpu as i32;
1683            cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
1684            cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
1685            cpu_ctxs[cpu].is_big = is_big;
1686
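                // Seed with the CPU id so each CPU gets a deterministic but
                // distinct shuffle of the layer orders below.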
1687            fastrand::seed(cpu as u64);
1688
1689            let mut ogp_order = op_layers.clone();
1690            ogp_order.append(&mut gp_layers.clone());
1691            fastrand::shuffle(&mut ogp_order);
1692
1693            let mut ogn_order = on_layers.clone();
1694            ogn_order.append(&mut gn_layers.clone());
1695            fastrand::shuffle(&mut ogn_order);
1696
1697            let mut op_order = op_layers.clone();
1698            fastrand::shuffle(&mut op_order);
1699
1700            let mut on_order = on_layers.clone();
1701            fastrand::shuffle(&mut on_order);
1702
1703            let mut gp_order = gp_layers.clone();
1704            fastrand::shuffle(&mut gp_order);
1705
1706            let mut gn_order = gn_layers.clone();
1707            fastrand::shuffle(&mut gn_order);
1708
1709            for i in 0..MAX_LAYERS {
1710                cpu_ctxs[cpu].ogp_layer_order[i] =
1711                    ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1712                cpu_ctxs[cpu].ogn_layer_order[i] =
1713                    ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1714
1715                cpu_ctxs[cpu].op_layer_order[i] =
1716                    op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1717                cpu_ctxs[cpu].on_layer_order[i] =
1718                    on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1719                cpu_ctxs[cpu].gp_layer_order[i] =
1720                    gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1721                cpu_ctxs[cpu].gn_layer_order[i] =
1722                    gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1723            }
1724        }
1725
1726        Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
1727
1728        skel.maps
1729            .cpu_ctxs
1730            .update_percpu(
1731                &key,
1732                &Self::convert_cpu_ctxs(cpu_ctxs),
1733                libbpf_rs::MapFlags::ANY,
1734            )
1735            .context("Failed to update cpu_ctx")?;
1736
1737        Ok(())
1738    }
1739
1740    fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
1741        for (&llc_id, llc) in &topo.all_llcs {
1742            // Collect the orders.
1743            let mut node_order: Vec<usize> =
1744                topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
1745            let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
1746
1747            // Make the orders exclusive.
1748            sys_order.retain(|id| !node_order.contains(id));
1749            node_order.retain(|&id| id != llc_id);
1750
1751            // Shuffle so that different LLCs follow different orders. See
1752            // init_cpu_prox_map().
1753            fastrand::seed(llc_id as u64);
1754            fastrand::shuffle(&mut sys_order);
1755            fastrand::shuffle(&mut node_order);
1756
1757            // Concatenate and record the node boundary.
1758            let mut order: Vec<usize> = vec![];
1759            let mut idx: usize = 0;
1760
1761            idx += 1;
1762            order.push(llc_id);
1763
1764            idx += node_order.len();
1765            order.append(&mut node_order);
1766            let node_end = idx;
1767
1768            idx += sys_order.len();
1769            order.append(&mut sys_order);
1770            let sys_end = idx;
1771
1772            debug!(
1773                "LLC[{}] proximity map[{}/{}]: {:?}",
1774                llc_id, node_end, sys_end, &order
1775            );
1776
1777            // Record in llc_ctx.
1778            //
1779            // XXX - This would be a lot easier if llc_ctx were in the bss.
1780            // See BpfStats::read().
1781            let key = llc_id as u32;
1782            let llc_id_slice =
1783                unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
1784            let v = skel
1785                .maps
1786                .llc_data
1787                .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
1788                .unwrap()
1789                .unwrap();
1790            let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
1791
1792            let pmap = &mut llcc.prox_map;
1793            for (i, &llc_id) in order.iter().enumerate() {
1794                pmap.llcs[i] = llc_id as u16;
1795            }
1796            pmap.node_end = node_end as u32;
1797            pmap.sys_end = sys_end as u32;
1798
1799            let v = unsafe {
1800                std::slice::from_raw_parts(
1801                    &llcc as *const bpf_intf::llc_ctx as *const u8,
1802                    std::mem::size_of::<bpf_intf::llc_ctx>(),
1803                )
1804            };
1805
1806            skel.maps
1807                .llc_data
1808                .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
1809        }
1810
1811        Ok(())
1812    }
1813
1814    fn init(
1815        opts: &'a Opts,
1816        layer_specs: &[LayerSpec],
1817        open_object: &'a mut MaybeUninit<OpenObject>,
1818    ) -> Result<Self> {
1819        let nr_layers = layer_specs.len();
1820        let mut disable_topology = opts.disable_topology.unwrap_or(false);
1821
1822        let topo = Arc::new(if disable_topology {
1823            Topology::with_flattened_llc_node()?
1824        } else {
1825            Topology::new()?
1826        });
1827
1828        /*
1829         * FIXME: scx_layered incorrectly assumes that node, LLC and CPU IDs
1830         * are consecutive. Verify that they are on this system and bail if
1831         * not. It's lucky that core ID is not used anywhere as core IDs are
1832         * not consecutive on some Ryzen CPUs.
1833         */
1834        if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
1835            bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
1836        }
1837        if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
1838            bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
1839        }
1840        if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
1841            bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
1842        }
1843
1844        let netdevs = if opts.netdev_irq_balance {
1845            warn!(
1846                "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
1847            );
1848            read_netdevs()?
1849        } else {
1850            BTreeMap::new()
1851        };
1852
1853        if !disable_topology {
1854            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
1855                disable_topology = true;
1856            };
1857            info!(
1858                "Topology awareness not specified, selecting {} based on hardware",
1859                if disable_topology {
1860                    "disabled"
1861                } else {
1862                    "enabled"
1863                }
1864            );
1865        };
1866
1867        let cpu_pool = CpuPool::new(topo.clone())?;
1868
1869        // If topology awareness is disabled, clear out any NUMA/LLC configs so
1870        // that the layers fall back to using all cores.
1871        let layer_specs: Vec<_> = if disable_topology {
1872            info!("Disabling topology awareness");
1873            layer_specs
1874                .iter()
1875                .cloned()
1876                .map(|mut s| {
1877                    s.kind.common_mut().nodes.clear();
1878                    s.kind.common_mut().llcs.clear();
1879                    s
1880                })
1881                .collect()
1882        } else {
1883            layer_specs.to_vec()
1884        };
1885
1886        // Open the BPF prog first for verification.
1887        let mut skel_builder = BpfSkelBuilder::default();
1888        skel_builder.obj_builder.debug(opts.verbose > 1);
1889        init_libbpf_logging(None);
1890        let mut skel = scx_ops_open!(skel_builder, open_object, layered)?;
1891
1892        // Enable autoload for conditionally loaded programs immediately after
1893        // creating the skeleton (this always happens before loading).
1894        if opts.enable_gpu_support {
1895            // By default, kprobe nvidia_open when GPU support is enabled;
1896            // open has been observed to be relatively cheap to kprobe.
1897            if opts.gpu_kprobe_level >= 1 {
1898                compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
1899            }
1900            // enable the rest progressively based upon how often they are called
1901            // for observed workloads
1902            if opts.gpu_kprobe_level >= 2 {
1903                compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
1904            }
1905            if opts.gpu_kprobe_level >= 3 {
1906                compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
1907            }
1908        }
1909
1910        let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
1911        let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
1912
1913        if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
1914            skel.maps.rodata_data.ext_sched_class_addr = ext_sched_class_addr.unwrap();
1915            skel.maps.rodata_data.idle_sched_class_addr = idle_sched_class_addr.unwrap();
1916        } else {
1917            warn!(
1918                "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
1919            );
1920        }
1921
1922        skel.maps.rodata_data.slice_ns = scx_enums.SCX_SLICE_DFL;
1923        skel.maps.rodata_data.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
1924
1925        // Initialize skel according to @opts.
1926        skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
1927
1928        if !opts.disable_queued_wakeup {
1929            match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
1930                0 => info!("Kernel does not support queued wakeup optimization"),
1931                v => skel.struct_ops.layered_mut().flags |= v,
1932            }
1933        }
1934
1935        skel.maps.rodata_data.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
1936        skel.maps.rodata_data.percpu_kthread_preempt_all =
1937            !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
1938        skel.maps.rodata_data.debug = opts.verbose as u32;
1939        skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
1940        skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
1941            opts.max_exec_us * 1000
1942        } else {
1943            opts.slice_us * 1000 * 20
1944        };
1945        skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u32;
1946        skel.maps.rodata_data.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
1947        skel.maps.rodata_data.smt_enabled = topo.smt_enabled;
1948        skel.maps.rodata_data.has_little_cores = topo.has_little_cores();
1949        skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
1950        skel.maps.rodata_data.antistall_sec = opts.antistall_sec;
1951        skel.maps.rodata_data.monitor_disable = opts.monitor_disable;
1952        skel.maps.rodata_data.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
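            // lo_fb_share is scaled to parts-per-1024 (hence the _ppk suffix)
            // and clamped to [1, 1024].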
1953        skel.maps.rodata_data.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
1954        skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
1955        skel.maps.rodata_data.enable_match_debug = opts.enable_match_debug;
1956        skel.maps.rodata_data.enable_gpu_support = opts.enable_gpu_support;
1957
1958        for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
1959            skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
1960        }
1961        for cpu in topo.all_cpus.keys() {
1962            skel.maps.rodata_data.all_cpus[cpu / 8] |= 1 << (cpu % 8);
1963        }
1964
1965        skel.maps.rodata_data.nr_op_layers = layer_specs
1966            .iter()
1967            .filter(|spec| match &spec.kind {
1968                LayerKind::Open { .. } => spec.kind.common().preempt,
1969                _ => false,
1970            })
1971            .count() as u32;
1972        skel.maps.rodata_data.nr_on_layers = layer_specs
1973            .iter()
1974            .filter(|spec| match &spec.kind {
1975                LayerKind::Open { .. } => !spec.kind.common().preempt,
1976                _ => false,
1977            })
1978            .count() as u32;
1979        skel.maps.rodata_data.nr_gp_layers = layer_specs
1980            .iter()
1981            .filter(|spec| match &spec.kind {
1982                LayerKind::Grouped { .. } => spec.kind.common().preempt,
1983                _ => false,
1984            })
1985            .count() as u32;
1986        skel.maps.rodata_data.nr_gn_layers = layer_specs
1987            .iter()
1988            .filter(|spec| match &spec.kind {
1989                LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1990                _ => false,
1991            })
1992            .count() as u32;
1993        skel.maps.rodata_data.nr_excl_layers = layer_specs
1994            .iter()
1995            .filter(|spec| spec.kind.common().exclusive)
1996            .count() as u32;
1997
1998        let mut min_open = u64::MAX;
1999        let mut min_preempt = u64::MAX;
2000
2001        for spec in layer_specs.iter() {
2002            if let LayerKind::Open { common, .. } = &spec.kind {
2003                min_open = min_open.min(common.disallow_open_after_us.unwrap());
2004                min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2005            }
2006        }
2007
2008        skel.maps.rodata_data.min_open_layer_disallow_open_after_ns = match min_open {
2009            u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2010            v => v,
2011        };
2012        skel.maps
2013            .rodata_data
2014            .min_open_layer_disallow_preempt_after_ns = match min_preempt {
2015            u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2016            v => v,
2017        };
2018
2019        // Consider all layers empty at the beginning.
2020        for i in 0..layer_specs.len() {
2021            skel.maps.bss_data.empty_layer_ids[i] = i as u32;
2022        }
2023        skel.maps.bss_data.nr_empty_layer_ids = nr_layers as u32;
2024
2025        Self::init_layers(&mut skel, &layer_specs, &topo)?;
2026        Self::init_nodes(&mut skel, opts, &topo);
2027
2028        // Set the pin path before loading the skeleton. This ensures libbpf
2029        // creates and pins the map, or reuses the already-pinned map fd for us,
2030        // so that the map pinned by a previous run keeps being reused across
2031        // scheduler restarts.
2032        let layered_task_hint_map_path = &opts.task_hint_map;
2033        let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2034        // Only set pin path if a path is provided.
2035        if !layered_task_hint_map_path.is_empty() {
2036            hint_map.set_pin_path(layered_task_hint_map_path).unwrap();
2037        }
2038
2039        let mut skel = scx_ops_load!(skel, layered, uei)?;
2040
2041        let mut layers = vec![];
2042        let layer_growth_orders =
2043            LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2044        for (idx, spec) in layer_specs.iter().enumerate() {
2045            let growth_order = layer_growth_orders
2046                .get(&idx)
2047                .with_context(|| "layer has no growth order".to_string())?;
2048            layers.push(Layer::new(spec, &topo, growth_order)?);
2049        }
2050
2051        let mut idle_qos_enabled = layers
2052            .iter()
2053            .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2054        if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2055            warn!("idle_resume_us not supported, ignoring");
2056            idle_qos_enabled = false;
2057        }
2058
2059        Self::init_cpus(&skel, &layer_specs, &topo)?;
2060        Self::init_llc_prox_map(&mut skel, &topo)?;
2061
2062        // Other stuff.
2063        let proc_reader = procfs::ProcReader::new();
2064
2065        // Handle setup if layered is running in a pid namespace.
2066        let input = ProgramInput {
2067            ..Default::default()
2068        };
2069        let prog = &mut skel.progs.initialize_pid_namespace;
2070
2071        let _ = prog.test_run(input);
2072
2073        // XXX If we try to refresh the cpumasks here before attaching, we
2074        // sometimes (non-deterministically) don't see the updated values in
2075        // BPF. It would be better to update the cpumasks here before we
2076        // attach, but the value will quickly converge anyways so it's not a
2077        // huge problem in the interim until we figure it out.
2078
2079        // Allow all tasks to open and write to BPF task hint map, now that
2080        // we should have it pinned at the desired location.
2081        if !layered_task_hint_map_path.is_empty() {
2082            let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2083            let mode: libc::mode_t = 0o666;
2084            unsafe {
2085                if libc::chmod(path.as_ptr(), mode) != 0 {
2086                    trace!("'chmod' to 666 of task hint map failed, continuing...");
2087                }
2088            }
2089        }
2090
2091        // Attach.
2092        let struct_ops = scx_ops_attach!(skel, layered)?;
2093        let stats_server = StatsServer::new(stats::server_data()).launch()?;
2094
2095        let sched = Self {
2096            struct_ops: Some(struct_ops),
2097            layer_specs,
2098
2099            sched_intv: Duration::from_secs_f64(opts.interval),
2100            layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2101
2102            cpu_pool,
2103            layers,
2104            idle_qos_enabled,
2105
2106            sched_stats: Stats::new(&mut skel, &proc_reader)?,
2107
2108            nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2109            processing_dur: Default::default(),
2110
2111            proc_reader,
2112            skel,
2113
2114            topo,
2115            netdevs,
2116            stats_server,
2117        };
2118
2119        info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2120
2121        Ok(sched)
2122    }
2123
2124    fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2125        for cpu in 0..mask.len() {
2126            if mask.test_cpu(cpu) {
2127                bpfmask[cpu / 8] |= 1 << (cpu % 8);
2128            } else {
2129                bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2130            }
2131        }
2132    }
2133
2134    fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2135        trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2136        Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2137
2138        bpf_layer.nr_cpus = layer.nr_cpus as u32;
2139        for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2140            bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2141        }
2142
2143        bpf_layer.refresh_cpus = 1;
2144    }
2145
2146    fn update_netdev_cpumasks(&mut self) -> Result<()> {
2147        let available_cpus = self.cpu_pool.available_cpus();
2148        if available_cpus.is_empty() {
2149            return Ok(());
2150        }
2151
2152        for (iface, netdev) in self.netdevs.iter_mut() {
2153            let node = self
2154                .topo
2155                .nodes
2156                .values()
2157                .find(|n| n.id == netdev.node())
2159                .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2160            let node_cpus = node.span.clone();
2161            for (irq, irqmask) in netdev.irqs.iter_mut() {
2162                irqmask.clear_all();
2163                for cpu in available_cpus.iter() {
2164                    if !node_cpus.test_cpu(cpu) {
2165                        continue;
2166                    }
2167                    let _ = irqmask.set_cpu(cpu);
2168                }
2169                // If no CPUs are available in the node then spread the load across the node
2170                if irqmask.weight() == 0 {
2171                    for cpu in node_cpus.iter() {
2172                        let _ = irqmask.set_cpu(cpu);
2173                    }
2174                }
2175                trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2176            }
2177            netdev.apply_cpumasks()?;
2178        }
2179
2180        Ok(())
2181    }
2182
2183    /// Calculate how many CPUs each layer would like to have if there were
2184    /// no competition. The CPU range is determined by applying the inverse
2185    /// of util_range and then capping by cpus_range. If the current
2186    /// allocation is within the acceptable range, no change is made.
2187    /// Returns a (target, min) pair for each layer.
2188    fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2189        let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2190        let utils = &self.sched_stats.layer_utils;
2191
2192        let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2193        let mut targets: Vec<(usize, usize)> = vec![];
2194
2195        for (idx, layer) in self.layers.iter().enumerate() {
2196            targets.push(match &layer.kind {
2197                LayerKind::Confined {
2198                    util_range,
2199                    cpus_range,
2200                    cpus_range_frac,
2201                    ..
2202                }
2203                | LayerKind::Grouped {
2204                    util_range,
2205                    cpus_range,
2206                    cpus_range_frac,
2207                    ..
2208                } => {
2209                    // A grouped layer can choose to include open cputime
2210                    // for sizing. Also, as an empty layer can only get CPU
2211                    // time through fallback (counted as owned) or open
2212                    // execution, add open cputime for empty layers.
2213                    let owned = utils[idx][LAYER_USAGE_OWNED];
2214                    let open = utils[idx][LAYER_USAGE_OPEN];
2215
2216                    let mut util = owned;
2217                    if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2218                        util += open;
2219                    }
2220
2221                    let util = if util < 0.01 { 0.0 } else { util };
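                        // Invert util_range to get the acceptable CPU count:
                        // e.g. util 3.0 with util_range (0.5, 0.75) needs at
                        // least ceil(3.0 / 0.75) = 4 and at most
                        // floor(3.0 / 0.5) = 6 CPUs.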
2222                    let low = (util / util_range.1).ceil() as usize;
2223                    let high = ((util / util_range.0).floor() as usize).max(low);
2224                    let target = layer.cpus.weight().clamp(low, high);
2225                    let cpus_range =
2226                        resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2227
2228                    records.push((
2229                        (owned * 100.0) as u64,
2230                        (open * 100.0) as u64,
2231                        (util * 100.0) as u64,
2232                        low,
2233                        high,
2234                        target,
2235                    ));
2236
2237                    (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2238                }
2239                LayerKind::Open { .. } => (0, 0),
2240            });
2241        }
2242
2243        trace!("initial targets: {:?}", &targets);
2244        trace!("(owned, open, util, low, high, target): {:?}", &records);
2245        targets
2246    }
2247
2248    /// Given the (target, min) pair for each layer, determined assuming an
2249    /// infinite number of CPUs, distribute the actual CPUs according to the
2250    /// layers' weights.
2251    fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2252        let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2253        let weights: Vec<usize> = self
2254            .layers
2255            .iter()
2256            .map(|layer| layer.kind.common().weight as usize)
2257            .collect();
2258        let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2259            .iter()
2260            .zip(&weights)
2261            .enumerate()
2262            .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2263            .collect();
2264        let mut weight_sum: usize = weights.iter().sum();
2265        let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2266
2267        trace!("cands: {:?}", &cands);
2268
2269        // First, accept all layers that are <= min.
2270        cands.retain(|&i, &mut (target, min, weight)| {
2271            if target <= min {
2272                let target = target.min(nr_left);
2273                weighted[i] = target;
2274                weight_sum -= weight;
2275                nr_left -= target;
2276                false
2277            } else {
2278                true
2279            }
2280        });
2281
2282        trace!("cands after accepting mins: {:?}", &cands);
2283
2284        // Keep accepting ones under their allotted share.
2285        let calc_share = |nr_left, weight, weight_sum| {
2286            (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2287        };
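            // e.g. with 10 CPUs left, a weight of 200, and a remaining
            // weight_sum of 500, the share is ceil(10 * 200 / 500) = 4.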
2288
2289        while !cands.is_empty() {
2290            let mut progress = false;
2291
2292            cands.retain(|&i, &mut (target, _min, weight)| {
2293                let share = calc_share(nr_left, weight, weight_sum);
2294                if target <= share {
2295                    weighted[i] = target;
2296                    weight_sum -= weight;
2297                    nr_left -= target;
2298                    progress = true;
2299                    false
2300                } else {
2301                    true
2302                }
2303            });
2304
2305            if !progress {
2306                break;
2307            }
2308        }
2309
2310        trace!("cands after accepting under allotted: {:?}", &cands);
2311
2312        // The remaining candidates are in contention with each other,
2313        // distribute according to the shares.
2314        let nr_to_share = nr_left;
2315        for (i, (_target, _min, weight)) in cands.into_iter() {
2316            let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2317            weighted[i] = share;
2318            nr_left -= share;
2319        }
2320
2321        trace!("weighted: {:?}", &weighted);
2322
2323        weighted
2324    }
2325
2326    // Break a layer's target CPU count, as computed by weighted_target_nr_cpus,
2327    // into a (full_llcs, extra_cores) tuple: the number of full LLCs the layer
2328    // occupies, plus the extra cores needed for the CPUs that don't fill an LLC.
2329    fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2330        // TODO(kkd): We assume each LLC has equal number of cores.
2331        let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2332        // TODO(kkd): We assume each core has fixed number of threads.
2333        let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2334        let cpus_per_llc = cores_per_llc * cpus_per_core;
2335
2336        let full = target / cpus_per_llc;
2337        let extra = target % cpus_per_llc;
2338
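            // e.g. with 8 cores * 2 SMT = 16 CPUs per LLC, a target of 40 CPUs
            // yields (2 full LLCs, ceil(8 / 2) = 4 extra cores).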
2339        (full, extra.div_ceil(cpus_per_core))
2340    }
2341
2342    // Recalculate the core order for layers using the StickyDynamic growth
2343    // algorithm. Tuples from compute_target_llcs decide how many LLCs and cores
2344    // should be assigned to each layer; the logic that allocates and frees CPUs
2345    // operates on that core order. This happens in three logical steps: first,
2346    // free LLCs from layers that shrank since the last recomputation; then
2347    // distribute the freed LLCs to growing layers; and finally spill any
2348    // leftover cores over into the free LLCs.
2349    fn recompute_layer_core_order(&mut self, layer_targets: &[(usize, usize)]) {
2350        // Collect freed LLCs from shrinking layers.
2351        debug!(
2352            " free: before pass: free_llcs={:?}",
2353            self.cpu_pool.free_llcs
2354        );
2355        for &(idx, target) in layer_targets.iter().rev() {
2356            let layer = &mut self.layers[idx];
2357            let old_tlc = layer.target_llc_cpus;
2358            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2359
2360            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2361                continue;
2362            }
2363
2364            let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
2365
2366            debug!(
2367                " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2368                layer.name,
2369                old_tlc,
2370                new_tlc,
2371                to_free,
2372                layer.assigned_llcs.len(),
2373                self.cpu_pool.free_llcs.len()
2374            );
2375
2376            while to_free > 0 && !layer.assigned_llcs.is_empty() {
2377                let llc = layer.assigned_llcs.pop().unwrap();
2378                self.cpu_pool.free_llcs.push((llc, 0));
2379                to_free -= 1;
2380
2381                debug!(" layer={} freed_llc={}", layer.name, llc);
2382            }
2383        }
2384        debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2385
2386        // Redistribute the freed LLCs to growing layers.
2387        for &(idx, target) in layer_targets.iter().rev() {
2388            let layer = &mut self.layers[idx];
2389            let old_tlc = layer.target_llc_cpus;
2390            let new_tlc = Self::compute_target_llcs(target, &self.topo);
2391
2392            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2393                continue;
2394            }
2395
2396            let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
2397
2398            debug!(
2399                " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2400                layer.name,
2401                old_tlc,
2402                new_tlc,
2403                to_alloc,
2404                layer.assigned_llcs.len(),
2405                self.cpu_pool.free_llcs.len()
2406            );
2407
2408            while to_alloc > 0
2409                && !self.cpu_pool.free_llcs.is_empty()
2410                && to_alloc <= self.cpu_pool.free_llcs.len()
2411            {
2412                let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2413                layer.assigned_llcs.push(llc);
2414                to_alloc -= 1;
2415
2416                debug!(" layer={} alloc_llc={}", layer.name, llc);
2417            }
2418
2419            debug!(
2420                " alloc: layer={} assigned_llcs={:?}",
2421                layer.name, layer.assigned_llcs
2422            );
2423
2424            // Update for next iteration.
2425            layer.target_llc_cpus = new_tlc;
2426        }
2427
2428        // Spillover overflowing cores into free LLCs. Bigger layers get to take
2429        // a chunk before smaller layers.
2430        for &(idx, _) in layer_targets.iter() {
2431            let mut core_order = vec![];
2432            let layer = &mut self.layers[idx];
2433
2434            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2435                continue;
2436            }
2437
2438            let tlc = layer.target_llc_cpus;
2439            let mut extra = tlc.1;
2440            // TODO(kkd): Move this logic into cpu_pool? What's the best place?
2441            let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2442            let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2443            let cpus_per_llc = cores_per_llc * cpus_per_core;
2444
2445            // Consume from front since we pop from the back.
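                // free_llcs entries are (llc_id, nr_consumed_cpus); the second
                // field only tracks consumption during this spillover pass and
                // is reset to zero afterwards.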
2446            for i in 0..self.cpu_pool.free_llcs.len() {
2447                let free_vec = &mut self.cpu_pool.free_llcs;
2448                // Available CPUs in LLC.
2449                let avail = cpus_per_llc - free_vec[i].1;
2450                // The amount we'll use.
2451                let mut used = extra.min(avail);
2452
2453                let shift = free_vec[i].1;
2454                free_vec[i].1 += used;
2455
2456                let llc_id = free_vec[i].0;
2457                let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2458
2459                for core in llc.cores.iter().skip(shift) {
2460                    core_order.push(core.1.id);
2461                    if used == 0 {
2462                        break;
2463                    }
2464                    used -= 1;
2465                }
2466
2467                extra -= used;
2468                if extra == 0 {
2469                    break;
2470                }
2471            }
2472
2473            core_order.reverse();
2474            layer.core_order = core_order;
2475        }
2476
2477        // Reset consumed entries in free LLCs.
2478        for i in 0..self.cpu_pool.free_llcs.len() {
2479            self.cpu_pool.free_llcs[i].1 = 0;
2480        }
2481
2482        for &(idx, _) in layer_targets.iter() {
2483            let layer = &mut self.layers[idx];
2484
2485            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2486                continue;
2487            }
2488
2489            for core in self.topo.all_cores.iter() {
2490                let llc_id = core.1.llc_id;
2491                if layer.assigned_llcs.contains(&llc_id) {
2492                    layer.core_order.push(core.1.id);
2493                }
2494            }
2495            // Update core_order for the layer, but reverse to keep the start stable.
2496            layer.core_order.reverse();
2497
2498            debug!(
2499                " alloc: layer={} core_order={:?}",
2500                layer.name, layer.core_order
2501            );
2502        }
2503    }
2504
2505    fn refresh_cpumasks(&mut self) -> Result<()> {
2506        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
2507
2508        let mut updated = false;
2509        let targets = self.calc_target_nr_cpus();
2510        let targets = self.weighted_target_nr_cpus(&targets);
2511
2512        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
2513        ascending.sort_by(|a, b| a.1.cmp(&b.1));
2514
2515        self.recompute_layer_core_order(&ascending);
2516
2517        // If any layer is growing, guarantee that the largest layer that is
2518        // freeing CPUs frees at least one CPU.
2519        let mut force_free = self
2520            .layers
2521            .iter()
2522            .zip(targets.iter())
2523            .any(|(layer, &target)| layer.nr_cpus < target);
2524
2525        // Shrink all layers first so that CPUs are available for
2526        // redistribution. Do so in the descending target number of CPUs
2527        // order.
2528        for &(idx, target) in ascending.iter().rev() {
2529            let layer = &mut self.layers[idx];
2530            if layer_is_open(layer) {
2531                continue;
2532            }
2533
2534            let nr_cur = layer.cpus.weight();
2535            if nr_cur <= target {
2536                continue;
2537            }
2538            let mut nr_to_free = nr_cur - target;
2539
2540            // There's some dampening built into the util metrics, but slow down
2541            // freeing further to avoid unnecessary changes. This is solely
2542            // based on intuition. Drop or update according to real-world
2543            // behavior.
2544            let nr_to_break_at = nr_to_free / 2;
2545
2546            let mut freed = false;
2547
2548            while nr_to_free > 0 {
2549                let max_to_free = if force_free {
2550                    force_free = false;
2551                    layer.nr_cpus
2552                } else {
2553                    nr_to_free
2554                };
2555
2556                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
2557                if nr_freed == 0 {
2558                    break;
2559                }
2560
2561                nr_to_free = nr_to_free.saturating_sub(nr_freed);
2562                freed = true;
2563
2564                if nr_to_free <= nr_to_break_at {
2565                    break;
2566                }
2567            }
2568
2569            if freed {
2570                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
2571                updated = true;
2572            }
2573        }
2574
2575        // Grow layers. Do so in the ascending target number of CPUs order
2576        // so that we're always more generous to smaller layers. This avoids
2577        // starving small layers and shouldn't make a noticeable difference for
2578        // bigger layers as work conservation should still be achieved
2579        // through open execution.
2580        for &(idx, target) in &ascending {
2581            let layer = &mut self.layers[idx];
2582
2583            if layer_is_open(layer) {
2584                continue;
2585            }
2586
2587            let nr_cur = layer.cpus.weight();
2588            if nr_cur >= target {
2589                continue;
2590            }
2591
2592            let mut nr_to_alloc = target - nr_cur;
2593            let mut alloced = false;
2594
2595            while nr_to_alloc > 0 {
2596                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
2597                if nr_alloced == 0 {
2598                    break;
2599                }
2600                alloced = true;
2601                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
2602            }
2603
2604            if alloced {
2605                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
2606                updated = true;
2607            }
2608        }
2609
2610        // Give the rest to the open layers.
2611        if updated {
2612            for (idx, layer) in self.layers.iter_mut().enumerate() {
2613                if !layer_is_open(layer) {
2614                    continue;
2615                }
2616
2617                let bpf_layer = &mut self.skel.maps.bss_data.layers[idx];
2618                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
2619                let nr_available_cpus = available_cpus.weight();
2620
2621                // Open layers need the intersection of allowed cpus and
2622                // available cpus.
2623                layer.cpus = available_cpus;
2624                layer.nr_cpus = nr_available_cpus;
2625                Self::update_bpf_layer_cpumask(layer, bpf_layer);
2626            }
2627
2628            self.skel.maps.bss_data.fallback_cpu = self.cpu_pool.fallback_cpu as u32;
2629
2630            for (lidx, layer) in self.layers.iter().enumerate() {
2631                self.nr_layer_cpus_ranges[lidx] = (
2632                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
2633                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
2634                );
2635            }
2636
2637            // Trigger updates on the BPF side.
2638            let input = ProgramInput {
2639                ..Default::default()
2640            };
2641            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
2642            let _ = prog.test_run(input);
2643
2644            // Update empty_layers.
2645            let empty_layer_ids: Vec<u32> = self
2646                .layers
2647                .iter()
2648                .enumerate()
2649                .filter(|(_idx, layer)| layer.nr_cpus == 0)
2650                .map(|(idx, _layer)| idx as u32)
2651                .collect();
2652            for i in 0..self.layers.len() {
2653                self.skel.maps.bss_data.empty_layer_ids[i] =
2654                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2655            }
2656            self.skel.maps.bss_data.nr_empty_layer_ids = empty_layer_ids.len() as u32;
2657        }
2658
2659        let _ = self.update_netdev_cpumasks();
2660        Ok(())
2661    }
2662
2663    fn refresh_idle_qos(&mut self) -> Result<()> {
2664        if !self.idle_qos_enabled {
2665            return Ok(());
2666        }
2667
2668        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
2669        for layer in self.layers.iter() {
2670            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
2671            for cpu in layer.cpus.iter() {
2672                cpu_idle_qos[cpu] = idle_resume_us;
2673            }
2674        }
2675
2676        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
2677            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
2678        }
2679
2680        Ok(())
2681    }
2682
2683    fn step(&mut self) -> Result<()> {
2684        let started_at = Instant::now();
2685        self.sched_stats.refresh(
2686            &mut self.skel,
2687            &self.proc_reader,
2688            started_at,
2689            self.processing_dur,
2690        )?;
2691        self.refresh_cpumasks()?;
2692        self.refresh_idle_qos()?;
2693        self.processing_dur += Instant::now().duration_since(started_at);
2694        Ok(())
2695    }
2696
2697    fn generate_sys_stats(
2698        &mut self,
2699        stats: &Stats,
2700        cpus_ranges: &mut [(usize, usize)],
2701    ) -> Result<SysStats> {
2702        let bstats = &stats.bpf_stats;
2703        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
2704
2705        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
2706            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
2707            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
2708            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
2709        }
2710
2711        Ok(sys_stats)
2712    }
2713
2714    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
2715        let (res_ch, req_ch) = self.stats_server.channels();
2716        let mut next_sched_at = Instant::now() + self.sched_intv;
2717        let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
2718        let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
2719        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
2720
2721        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
2722            let now = Instant::now();
2723
2724            if now >= next_sched_at {
2725                self.step()?;
2726                while next_sched_at < now {
2727                    next_sched_at += self.sched_intv;
2728                }
2729            }
2730
2731            if enable_layer_refresh && now >= next_layer_refresh_at {
2732                self.skel.maps.bss_data.layer_refresh_seq_avgruntime += 1;
2733                while next_layer_refresh_at < now {
2734                    next_layer_refresh_at += self.layer_refresh_intv;
2735                }
2736            }
2737
2738            match req_ch.recv_deadline(next_sched_at) {
2739                Ok(StatsReq::Hello(tid)) => {
2740                    cpus_ranges.insert(
2741                        tid,
2742                        self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
2743                    );
2744                    let stats = Stats::new(&mut self.skel, &self.proc_reader)?;
2745                    res_ch.send(StatsRes::Hello(stats))?;
2746                }
2747                Ok(StatsReq::Refresh(tid, mut stats)) => {
2748                    // Fold our layer cpu ranges into each client's ranges, then reset ours.
2749                    for i in 0..self.nr_layer_cpus_ranges.len() {
2750                        for (_, ranges) in cpus_ranges.iter_mut() {
2751                            ranges[i] = (
2752                                ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
2753                                ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
2754                            );
2755                        }
2756                        self.nr_layer_cpus_ranges[i] =
2757                            (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
2758                    }
2759
2760                    stats.refresh(&mut self.skel, &self.proc_reader, now, self.processing_dur)?;
2761                    let sys_stats =
2762                        self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
2763                    res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
2764                }
2765                Ok(StatsReq::Bye(tid)) => {
2766                    cpus_ranges.remove(&tid);
2767                    res_ch.send(StatsRes::Bye)?;
2768                }
2769                Err(RecvTimeoutError::Timeout) => {}
2770                Err(e) => Err(e)?,
2771            }
2772        }
2773
2774        let _ = self.struct_ops.take();
2775        uei_report!(&self.skel, uei)
2776    }
2777}
2778
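// Dropping the struct_ops link detaches the BPF scheduler, so the system falls
// back to the kernel's default scheduling even if run() bails out early.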
2779impl Drop for Scheduler<'_> {
2780    fn drop(&mut self) {
2781        if let Some(struct_ops) = self.struct_ops.take() {
2782            drop(struct_ops);
2783        }
2784    }
2785}
2786
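// Write EXAMPLE_CONFIG as pretty-printed JSON to `path`. create_new() makes
// this fail rather than overwrite an existing file.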
2787fn write_example_file(path: &str) -> Result<()> {
2788    let mut f = fs::OpenOptions::new()
2789        .create_new(true)
2790        .write(true)
2791        .open(path)?;
2792    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
2793}
2794
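// Sanity-check the final layer configuration: there must be 1..=MAX_LAYERS
// specs, every non-terminal spec needs at least one match block, the terminal
// (last) spec must be the catch-all with exactly one empty match block, and
// per-spec match counts, string lengths, cpus_range and util_range must stay
// within the limits shared with the BPF side.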
2795fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> {
2796    let nr_specs = specs.len();
2797    if nr_specs == 0 {
2798        bail!("No layer spec");
2799    }
2800    if nr_specs > MAX_LAYERS {
2801        bail!("Too many layer specs");
2802    }
2803
2804    for (idx, spec) in specs.iter().enumerate() {
2805        if idx < nr_specs - 1 {
2806            if spec.matches.is_empty() {
2807                bail!("Non-terminal spec {:?} has no match blocks", spec.name);
2808            }
2809        } else {
2810            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
2811                bail!("Terminal spec {:?} must have an empty match", spec.name);
2812            }
2813        }
2814
2815        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
2816            bail!(
2817                "Spec {:?} has too many ({}) OR match blocks",
2818                spec.name,
2819                spec.matches.len()
2820            );
2821        }
2822
2823        for (ands_idx, ands) in spec.matches.iter().enumerate() {
2824            if ands.len() > NR_LAYER_MATCH_KINDS {
2825                bail!(
2826                    "Spec {:?}'s {}th OR block has too many ({}) match conditions",
2827                    spec.name,
2828                    ands_idx,
2829                    ands.len()
2830                );
2831            }
2832            for one in ands.iter() {
2833                match one {
2834                    LayerMatch::CgroupPrefix(prefix) => {
2835                        if prefix.len() > MAX_PATH {
2836                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
2837                        }
2838                    }
2839                    LayerMatch::CgroupSuffix(suffix) => {
2840                        if suffix.len() > MAX_PATH {
2841                            bail!("Spec {:?} has too long a cgroup suffix", spec.name);
2842                        }
2843                    }
2844                    LayerMatch::CgroupContains(substr) => {
2845                        if substr.len() > MAX_PATH {
2846                            bail!("Spec {:?} has too long a cgroup substr", spec.name);
2847                        }
2848                    }
2849                    LayerMatch::CommPrefix(prefix) => {
2850                        if prefix.len() > MAX_COMM {
2851                            bail!("Spec {:?} has too long a comm prefix", spec.name);
2852                        }
2853                    }
2854                    LayerMatch::PcommPrefix(prefix) => {
2855                        if prefix.len() > MAX_COMM {
2856                            bail!("Spec {:?} has too long a process name prefix", spec.name);
2857                        }
2858                    }
2859                    _ => {}
2860                }
2861            }
2862        }
2863
2864        match spec.kind {
2865            LayerKind::Confined {
2866                cpus_range,
2867                util_range,
2868                ..
2869            }
2870            | LayerKind::Grouped {
2871                cpus_range,
2872                util_range,
2873                ..
2874            } => {
2875                if let Some((cpus_min, cpus_max)) = cpus_range {
2876                    if cpus_min > cpus_max {
2877                        bail!(
2878                            "Spec {:?} has invalid cpus_range({}, {})",
2879                            spec.name,
2880                            cpus_min,
2881                            cpus_max
2882                        );
2883                    }
2884                }
2885                if util_range.0 >= util_range.1 {
2886                    bail!(
2887                        "Spec {:?} has invalid util_range ({}, {})",
2888                        spec.name,
2889                        util_range.0,
2890                        util_range.1
2891                    );
2892                }
2893            }
2894            _ => {}
2895        }
2896    }
2897
2898    Ok(())
2899}
2900
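// Return (at most) the last `len` characters of `cgroup`,
// e.g. name_suffix("abcdef", 3) == "def".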
2901fn name_suffix(cgroup: &str, len: usize) -> String {
2902    let suffixlen = std::cmp::min(len, cgroup.len());
2903    let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
2904
2905    suffixrev.chars().rev().collect()
2906}
2907
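// Recursively collect every directory below `dir`. Used to enumerate all
// cgroups under /sys/fs/cgroup when expanding template matches.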
2908fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
2909    let mut paths = vec![];
2910
2911    if !dir.is_dir() {
2912        bail!("path {:?} is not a directory", dir);
2913    }
2914
2915    let direntries = fs::read_dir(dir)?;
2916
2917    for entry in direntries {
2918        let path = entry?.path();
2919        if path.is_dir() {
2920            paths.append(&mut traverse_sysfs(&path)?);
2921            paths.push(path);
2922        }
2923    }
2924
2925    Ok(paths)
2926}
2927
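// Read a cgroup's cpuset.cpus.effective and parse it into a Cpumask.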
2928fn find_cpumask(cgroup: &str) -> Cpumask {
2929    let mut path = String::from(cgroup);
2930    path.push_str("/cpuset.cpus.effective");
2931
2932    let description = fs::read_to_string(&path).unwrap();
2933
2934    Cpumask::from_cpulist(&description).unwrap()
2935}
2936
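// Expand a template match into concrete per-cgroup rules. For CgroupSuffix,
// every cgroup under /sys/fs/cgroup whose path ends with the suffix yields a
// (CgroupSuffix of the last 64 chars of the slash-terminated path, effective
// cpuset mask) pair. Illustrative example (actual results depend on the host's
// cgroup layout): a template suffix of "workload.slice" could expand to
// CgroupSuffix("/workload.slice/") paired with that cgroup's
// cpuset.cpus.effective.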
2937fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
2938    match rule {
2939        LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
2940            .into_iter()
2941            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
2942            .filter(|cgroup| cgroup.ends_with(suffix))
2943            .map(|cgroup| {
2944                (
2945                    {
2946                        let mut slashterminated = cgroup.clone();
2947                        slashterminated.push('/');
2948                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
2949                    },
2950                    find_cpumask(&cgroup),
2951                )
2952            })
2953            .collect()),
2954        _ => bail!("unsupported template match kind {:?}", rule),
2955    }
2956}
2957
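// Entry point: parse CLI options, set up logging and the Ctrl-C handler,
// optionally run in stats/monitor mode, expand template layers into concrete
// specs, apply per-layer defaults, verify the resulting config, and then run
// the scheduler, restarting it whenever it asks to be restarted.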
2958fn main() -> Result<()> {
2959    let opts = Opts::parse();
2960
2961    if opts.help_stats {
2962        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
2963        return Ok(());
2964    }
2965
2966    if opts.no_load_frac_limit {
2967        warn!("--no-load-frac-limit is deprecated and a no-op");
2968    }
2969    if opts.layer_preempt_weight_disable != 0.0 {
2970        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
2971    }
2972    if opts.layer_growth_weight_disable != 0.0 {
2973        warn!("--layer-growth-weight-disable is deprecated and a no-op");
2974    }
2975    if opts.local_llc_iteration {
2976        warn!("--local-llc-iteration is deprecated and a no-op");
2977    }
2978
2979    let llv = match opts.verbose {
2980        0 => simplelog::LevelFilter::Info,
2981        1 => simplelog::LevelFilter::Debug,
2982        _ => simplelog::LevelFilter::Trace,
2983    };
2984    let mut lcfg = simplelog::ConfigBuilder::new();
2985    lcfg.set_time_offset_to_local()
2986        .expect("Failed to set local time offset")
2987        .set_time_level(simplelog::LevelFilter::Error)
2988        .set_location_level(simplelog::LevelFilter::Off)
2989        .set_target_level(simplelog::LevelFilter::Off)
2990        .set_thread_level(simplelog::LevelFilter::Off);
2991    simplelog::TermLogger::init(
2992        llv,
2993        lcfg.build(),
2994        simplelog::TerminalMode::Stderr,
2995        simplelog::ColorChoice::Auto,
2996    )?;
2997
2998    debug!("opts={:?}", &opts);
2999
3000    let shutdown = Arc::new(AtomicBool::new(false));
3001    let shutdown_clone = shutdown.clone();
3002    ctrlc::set_handler(move || {
3003        shutdown_clone.store(true, Ordering::Relaxed);
3004    })
3005    .context("Error setting Ctrl-C handler")?;
3006
3007    if let Some(intv) = opts.monitor.or(opts.stats) {
3008        let shutdown_copy = shutdown.clone();
3009        let jh = std::thread::spawn(move || {
3010            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
3011                Ok(_) => {
3012                    debug!("stats monitor thread finished successfully")
3013                }
3014                Err(err) => {
3015                    warn!(
3016                        "stats monitor thread exited with an error: {}",
3017                        err
3018                    )
3019                }
3020            }
3021        });
3022        if opts.monitor.is_some() {
3023            let _ = jh.join();
3024            return Ok(());
3025        }
3026    }
3027
3028    if let Some(path) = &opts.example {
3029        write_example_file(path)?;
3030        return Ok(());
3031    }
3032
3033    let mut layer_config = match opts.run_example {
3034        true => EXAMPLE_CONFIG.clone(),
3035        false => LayerConfig { specs: vec![] },
3036    };
3037
3038    for (idx, input) in opts.specs.iter().enumerate() {
3039        let specs = LayerSpec::parse(input)
3040            .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
3041
3042        for spec in specs {
3043            match spec.template {
3044                Some(ref rule) => {
3045                    let matches = expand_template(rule)?;
3046                    // In the absence of matching cgroups, have template layers
3047                    // behave as non-template layers do.
3048                    if matches.is_empty() {
3049                        layer_config.specs.push(spec);
3050                    } else {
3051                        for (mt, mask) in matches {
3052                            let mut genspec = spec.clone();
3053
3054                            genspec.cpuset = Some(mask);
3055
3056                            // Push the new "and" rule into each "or" term.
3057                            for orterm in &mut genspec.matches {
3058                                orterm.push(mt.clone());
3059                            }
3060
3061                            match &mt {
3062                                LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
3063                                _ => bail!("Template match has unexpected type"),
3064                            }
3065
3066                            // Push the generated layer into the config.
3067                            layer_config.specs.push(genspec);
3068                        }
3069                    }
3070                }
3071
3072                None => {
3073                    layer_config.specs.push(spec);
3074                }
3075            }
3076        }
3077    }
3078
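    // Apply per-layer defaults: slice length, clamped weight, and the
    // disallow_{open,preempt}_after_us knobs. Preempt layers effectively
    // disable both timeouts; the rest fall back to the DFL_DISALLOW_* values.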
3079    for spec in layer_config.specs.iter_mut() {
3080        let common = spec.kind.common_mut();
3081
3082        if common.slice_us == 0 {
3083            common.slice_us = opts.slice_us;
3084        }
3085
3086        if common.weight == 0 {
3087            common.weight = DEFAULT_LAYER_WEIGHT;
3088        }
3089        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
3090
3091        if common.preempt {
3092            if common.disallow_open_after_us.is_some() {
3093                warn!(
3094                    "Preempt layer {} has non-null disallow_open_after_us, ignored",
3095                    &spec.name
3096                );
3097            }
3098            if common.disallow_preempt_after_us.is_some() {
3099                warn!(
3100                    "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
3101                    &spec.name
3102                );
3103            }
3104            common.disallow_open_after_us = Some(u64::MAX);
3105            common.disallow_preempt_after_us = Some(u64::MAX);
3106        } else {
3107            if common.disallow_open_after_us.is_none() {
3108                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
3109            }
3110
3111            if common.disallow_preempt_after_us.is_none() {
3112                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
3113            }
3114        }
3115
3116        if common.idle_smt.is_some() {
3117            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
3118        }
3119    }
3120
3121    if opts.print_and_exit {
3122        println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3123        return Ok(());
3124    }
3125
3126    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3127    verify_layer_specs(&layer_config.specs)?;
3128
3129    let mut open_object = MaybeUninit::uninit();
3130    loop {
3131        let mut sched = Scheduler::init(&opts, &layer_config.specs, &mut open_object)?;
3132        if !sched.run(shutdown.clone())?.should_restart() {
3133            break;
3134        }
3135    }
3136
3137    Ok(())
3138}
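
// Illustrative sanity check for the name_suffix() helper above: a minimal
// sketch, not part of the upstream test suite.
#[cfg(test)]
mod name_suffix_tests {
    use super::*;

    #[test]
    fn takes_trailing_characters() {
        // Last three characters of a longer string.
        assert_eq!(name_suffix("abcdef", 3), "def");
        // Requesting more than the string length returns the whole string.
        assert_eq!(name_suffix("ab", 8), "ab");
        // A slash-terminated cgroup-style path keeps its trailing slash.
        assert_eq!(name_suffix("/sys/fs/cgroup/workload.slice/", 7), ".slice/");
    }
}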