1mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::ffi::CString;
12use std::fs;
13use std::io::Write;
14use std::mem::MaybeUninit;
15use std::ops::Sub;
16use std::path::Path;
17use std::path::PathBuf;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::thread::ThreadId;
22use std::time::Duration;
23use std::time::Instant;
24
25use inotify::{Inotify, WatchMask};
26use libc;
27use std::os::unix::io::AsRawFd;
28
29use anyhow::anyhow;
30use anyhow::bail;
31use anyhow::Context;
32use anyhow::Result;
33pub use bpf_skel::*;
34use clap::Parser;
35use crossbeam::channel::Receiver;
36use crossbeam::select;
37use lazy_static::lazy_static;
38use libbpf_rs::libbpf_sys;
39use libbpf_rs::AsRawLibbpf;
40use libbpf_rs::MapCore as _;
41use libbpf_rs::OpenObject;
42use libbpf_rs::ProgramInput;
43use log::info;
44use log::trace;
45use log::warn;
46use log::{debug, error};
47use nix::sched::CpuSet;
48use nvml_wrapper::error::NvmlError;
49use nvml_wrapper::Nvml;
50use once_cell::sync::OnceCell;
51use regex::Regex;
52use scx_bpf_compat;
53use scx_layered::*;
54use scx_raw_pmu::PMUManager;
55use scx_stats::prelude::*;
56use scx_utils::build_id;
57use scx_utils::compat;
58use scx_utils::init_libbpf_logging;
59use scx_utils::libbpf_clap_opts::LibbpfOpts;
60use scx_utils::perf;
61use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
62use scx_utils::read_netdevs;
63use scx_utils::scx_enums;
64use scx_utils::scx_ops_attach;
65use scx_utils::scx_ops_load;
66use scx_utils::scx_ops_open;
67use scx_utils::uei_exited;
68use scx_utils::uei_report;
69use scx_utils::CoreType;
70use scx_utils::Cpumask;
71use scx_utils::Llc;
72use scx_utils::NetDev;
73use scx_utils::Topology;
74use scx_utils::TopologyArgs;
75use scx_utils::UserExitInfo;
76use scx_utils::NR_CPUS_POSSIBLE;
77use scx_utils::NR_CPU_IDS;
78use stats::LayerStats;
79use stats::StatsReq;
80use stats::StatsRes;
81use stats::SysStats;
82use std::collections::VecDeque;
83use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
84use walkdir::WalkDir;
85
86const SCHEDULER_NAME: &str = "scx_layered";
87const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
88const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
89const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
90const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
91const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
92const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
93const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
94const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
95const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
96const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
97
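// Indices into the per-CPU layer usage accounting arrays; these mirror the
// layer_usage enum on the BPF side.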
98const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
99const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
100const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
101const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
102const LAYER_USAGE_PROTECTED_PREEMPT: usize =
103 bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
104const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
105
106const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
107const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
108const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
109
110const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
111
112static NVML: OnceCell<Nvml> = OnceCell::new();
113
114fn nvml() -> Result<&'static Nvml, NvmlError> {
115 NVML.get_or_try_init(Nvml::init)
116}
117
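// USAGE_DECAY is the per-second decay factor corresponding to USAGE_HALF_LIFE.
// The DFL_DISALLOW_*_AFTER_US defaults are derived from the kernel's default
// slice length. EXAMPLE_CONFIG is the built-in example layer configuration
// (see the --example and --run-example options).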
118lazy_static! {
119 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
120 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
121 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
122 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
123 specs: vec![
124 LayerSpec {
125 name: "batch".into(),
126 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
127 cpuset: None,
128 template: None,
129 matches: vec![
130 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
131 vec![LayerMatch::NiceAbove(0)],
132 ],
133 kind: LayerKind::Confined {
134 util_range: (0.8, 0.9),
135 cpus_range: Some((0, 16)),
136 cpus_range_frac: None,
137 protected: false,
138 membw_gb: None,
139 common: LayerCommon {
140 min_exec_us: 1000,
141 yield_ignore: 0.0,
142 preempt: false,
143 preempt_first: false,
144 exclusive: false,
145 allow_node_aligned: false,
146 skip_remote_node: false,
147 prev_over_idle_core: false,
148 idle_smt: None,
149 slice_us: 20000,
150 fifo: false,
151 weight: DEFAULT_LAYER_WEIGHT,
152 disallow_open_after_us: None,
153 disallow_preempt_after_us: None,
154 xllc_mig_min_us: 1000.0,
155 growth_algo: LayerGrowthAlgo::Sticky,
156 idle_resume_us: None,
157 perf: 1024,
158 nodes: vec![],
159 llcs: vec![],
160 member_expire_ms: 0,
161 placement: LayerPlacement::Standard,
162 },
163 },
164 },
165 LayerSpec {
166 name: "immediate".into(),
167 comment: Some("tasks under workload.slice with nice value < 0".into()),
168 cpuset: None,
169 template: None,
170 matches: vec![vec![
171 LayerMatch::CgroupPrefix("workload.slice/".into()),
172 LayerMatch::NiceBelow(0),
173 ]],
174 kind: LayerKind::Open {
175 common: LayerCommon {
176 min_exec_us: 100,
177 yield_ignore: 0.25,
178 preempt: true,
179 preempt_first: false,
180 exclusive: true,
181 allow_node_aligned: true,
182 skip_remote_node: false,
183 prev_over_idle_core: true,
184 idle_smt: None,
185 slice_us: 20000,
186 fifo: false,
187 weight: DEFAULT_LAYER_WEIGHT,
188 disallow_open_after_us: None,
189 disallow_preempt_after_us: None,
190 xllc_mig_min_us: 0.0,
191 growth_algo: LayerGrowthAlgo::Sticky,
192 perf: 1024,
193 idle_resume_us: None,
194 nodes: vec![],
195 llcs: vec![],
196 member_expire_ms: 0,
197 placement: LayerPlacement::Standard,
198 },
199 },
200 },
201 LayerSpec {
202 name: "stress-ng".into(),
203 comment: Some("stress-ng test layer".into()),
204 cpuset: None,
205 template: None,
206 matches: vec![
207 vec![LayerMatch::CommPrefix("stress-ng".into()),],
208 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
209 ],
210 kind: LayerKind::Confined {
211 cpus_range: None,
212 util_range: (0.2, 0.8),
213 protected: false,
214 cpus_range_frac: None,
215 membw_gb: None,
216 common: LayerCommon {
217 min_exec_us: 800,
218 yield_ignore: 0.0,
219 preempt: true,
220 preempt_first: false,
221 exclusive: false,
222 allow_node_aligned: false,
223 skip_remote_node: false,
224 prev_over_idle_core: false,
225 idle_smt: None,
226 slice_us: 800,
227 fifo: false,
228 weight: DEFAULT_LAYER_WEIGHT,
229 disallow_open_after_us: None,
230 disallow_preempt_after_us: None,
231 xllc_mig_min_us: 0.0,
232 growth_algo: LayerGrowthAlgo::Topo,
233 perf: 1024,
234 idle_resume_us: None,
235 nodes: vec![],
236 llcs: vec![],
237 member_expire_ms: 0,
238 placement: LayerPlacement::Standard,
239 },
240 },
241 },
242 LayerSpec {
243 name: "normal".into(),
244 comment: Some("the rest".into()),
245 cpuset: None,
246 template: None,
247 matches: vec![vec![]],
248 kind: LayerKind::Grouped {
249 cpus_range: None,
250 util_range: (0.5, 0.6),
251 util_includes_open_cputime: true,
252 protected: false,
253 cpus_range_frac: None,
254 membw_gb: None,
255 common: LayerCommon {
256 min_exec_us: 200,
257 yield_ignore: 0.0,
258 preempt: false,
259 preempt_first: false,
260 exclusive: false,
261 allow_node_aligned: false,
262 skip_remote_node: false,
263 prev_over_idle_core: false,
264 idle_smt: None,
265 slice_us: 20000,
266 fifo: false,
267 weight: DEFAULT_LAYER_WEIGHT,
268 disallow_open_after_us: None,
269 disallow_preempt_after_us: None,
270 xllc_mig_min_us: 100.0,
271 growth_algo: LayerGrowthAlgo::Linear,
272 perf: 1024,
273 idle_resume_us: None,
274 nodes: vec![],
275 llcs: vec![],
276 member_expire_ms: 0,
277 placement: LayerPlacement::Standard,
278 },
279 },
280 },
281 ],
282 };
283}
284
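// Command-line options for scx_layered, parsed with clap. Layer specs are
// supplied through the positional `specs` arguments.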
285#[derive(Debug, Parser)]
573#[command(verbatim_doc_comment)]
574struct Opts {
575 #[clap(short = 's', long, default_value = "20000")]
577 slice_us: u64,
578
579 #[clap(short = 'M', long, default_value = "0")]
584 max_exec_us: u64,
585
586 #[clap(short = 'i', long, default_value = "0.1")]
588 interval: f64,
589
590 #[clap(short = 'n', long, default_value = "false")]
593 no_load_frac_limit: bool,
594
595 #[clap(long, default_value = "0")]
597 exit_dump_len: u32,
598
599 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
602 verbose: u8,
603
604 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
608 disable_topology: Option<bool>,
609
610 #[clap(long)]
612 xnuma_preemption: bool,
613
614 #[clap(long)]
616 monitor_disable: bool,
617
618 #[clap(short = 'e', long)]
620 example: Option<String>,
621
622 #[clap(long, default_value = "0.0")]
626 layer_preempt_weight_disable: f64,
627
628 #[clap(long, default_value = "0.0")]
632 layer_growth_weight_disable: f64,
633
634 #[clap(long)]
636 stats: Option<f64>,
637
638 #[clap(long)]
641 monitor: Option<f64>,
642
643 #[clap(long)]
645 run_example: bool,
646
647 #[clap(long, default_value = "false")]
650 local_llc_iteration: bool,
651
652 #[clap(long, default_value = "10000")]
657 lo_fb_wait_us: u64,
658
659 #[clap(long, default_value = ".05")]
662 lo_fb_share: f64,
663
664 #[clap(long, default_value = "false")]
666 disable_antistall: bool,
667
668 #[clap(long, default_value = "false")]
670 enable_gpu_affinitize: bool,
671
672 #[clap(long, default_value = "900")]
675 gpu_affinitize_secs: u64,
676
677 #[clap(long, default_value = "false")]
682 enable_match_debug: bool,
683
684 #[clap(long, default_value = "3")]
686 antistall_sec: u64,
687
688 #[clap(long, default_value = "false")]
690 enable_gpu_support: bool,
691
692 #[clap(long, default_value = "3")]
700 gpu_kprobe_level: u64,
701
702 #[clap(long, default_value = "false")]
704 netdev_irq_balance: bool,
705
706 #[clap(long, default_value = "false")]
708 disable_queued_wakeup: bool,
709
710 #[clap(long, default_value = "false")]
712 disable_percpu_kthread_preempt: bool,
713
714 #[clap(long, default_value = "false")]
718 percpu_kthread_preempt_all: bool,
719
720 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
722 version: bool,
723
724 #[clap(long)]
726 help_stats: bool,
727
728 specs: Vec<String>,
730
731 #[clap(long, default_value = "2000")]
734 layer_refresh_ms_avgruntime: u64,
735
736 #[clap(long, default_value = "")]
738 task_hint_map: String,
739
740 #[clap(long, default_value = "false")]
742 print_and_exit: bool,
743
744 #[clap(long, default_value = "")]
746 hi_fb_thread_name: String,
747
748 #[clap(flatten, next_help_heading = "Topology Options")]
749 topology: TopologyArgs,
750
751 #[clap(flatten, next_help_heading = "Libbpf Options")]
752 pub libbpf: LibbpfOpts,
753}
754
755#[derive(Debug, Clone)]
757enum CgroupEvent {
758 Created {
759 path: String,
        cgroup_id: u64,
        match_bitmap: u64,
    },
763 Removed {
764 path: String,
        cgroup_id: u64,
    },
767}
768
769fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
770 reader
771 .read_stat()
772 .context("Failed to read procfs")?
773 .total_cpu
774 .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
775}
776
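// CPU busy fraction between two procfs snapshots:
//   busy = user + system + nice + irq + softirq + stolen
//   util = busy / (busy + idle + iowait), clamped to [0.0, 1.0]
// If no time has elapsed, the CPU is considered fully busy.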
777fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
778 match (curr, prev) {
779 (
780 fb_procfs::CpuStat {
781 user_usec: Some(curr_user),
782 nice_usec: Some(curr_nice),
783 system_usec: Some(curr_system),
784 idle_usec: Some(curr_idle),
785 iowait_usec: Some(curr_iowait),
786 irq_usec: Some(curr_irq),
787 softirq_usec: Some(curr_softirq),
788 stolen_usec: Some(curr_stolen),
789 ..
790 },
791 fb_procfs::CpuStat {
792 user_usec: Some(prev_user),
793 nice_usec: Some(prev_nice),
794 system_usec: Some(prev_system),
795 idle_usec: Some(prev_idle),
796 iowait_usec: Some(prev_iowait),
797 irq_usec: Some(prev_irq),
798 softirq_usec: Some(prev_softirq),
799 stolen_usec: Some(prev_stolen),
800 ..
801 },
802 ) => {
803 let idle_usec = curr_idle.saturating_sub(*prev_idle);
804 let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
805 let user_usec = curr_user.saturating_sub(*prev_user);
806 let system_usec = curr_system.saturating_sub(*prev_system);
807 let nice_usec = curr_nice.saturating_sub(*prev_nice);
808 let irq_usec = curr_irq.saturating_sub(*prev_irq);
809 let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
810 let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
811
812 let busy_usec =
813 user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
814 let total_usec = idle_usec + busy_usec + iowait_usec;
815 if total_usec > 0 {
816 Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
817 } else {
818 Ok(1.0)
819 }
820 }
821 _ => bail!("Missing stats in cpustat"),
822 }
823}
824
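// Copy `src` as a NUL-terminated C string into a fixed-size i8 buffer shared
// with BPF. Panics if the string (including the NUL) does not fit in `dst`.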
825fn copy_into_cstr(dst: &mut [i8], src: &str) {
826 let cstr = CString::new(src).unwrap();
827 let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
828 dst[0..bytes.len()].copy_from_slice(bytes);
829}
830
831fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
832 let mut mask = 0;
833 for node in nodes {
834 mask |= 1 << node;
835 }
836 mask
837}
838
839fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
840 let mut mask = 0;
841 for (_, cache) in llcs {
842 mask |= 1 << cache.id;
843 }
844 mask
845}
846
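// Read the per-CPU cpu_ctx structs for all possible CPUs out of the percpu
// BPF map.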
847fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
848 let mut cpu_ctxs = vec![];
849 let cpu_ctxs_vec = skel
850 .maps
851 .cpu_ctxs
852 .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
853 .context("Failed to lookup cpu_ctx")?
854 .unwrap();
855 for cpu in 0..*NR_CPUS_POSSIBLE {
856 cpu_ctxs.push(*unsafe {
857 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
858 });
859 }
860 Ok(cpu_ctxs)
861}
862
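// Snapshot of BPF-side counters: global stats, per-layer stats (plus per-stat
// sums across layers), and per-layer per-LLC stats. Subtracting one snapshot
// from another yields the deltas for a reporting interval.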
863#[derive(Clone, Debug)]
864struct BpfStats {
865 gstats: Vec<u64>,
866 lstats: Vec<Vec<u64>>,
867 lstats_sums: Vec<u64>,
    llc_lstats: Vec<Vec<Vec<u64>>>,
}
870
871impl BpfStats {
872 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
873 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
874 let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
875 let mut gstats = vec![0u64; NR_GSTATS];
876 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
877 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
878
879 for cpu in 0..*NR_CPUS_POSSIBLE {
880 for stat in 0..NR_GSTATS {
881 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
882 }
883 for layer in 0..nr_layers {
884 for stat in 0..NR_LSTATS {
885 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
886 }
887 }
888 }
889
890 let mut lstats_sums = vec![0u64; NR_LSTATS];
891 for layer in 0..nr_layers {
892 for stat in 0..NR_LSTATS {
893 lstats_sums[stat] += lstats[layer][stat];
894 }
895 }
896
897 for llc_id in 0..nr_llcs {
898 let key = llc_id as u32;
904 let llc_id_slice =
905 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
906 let v = skel
907 .maps
908 .llc_data
909 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
910 .unwrap()
911 .unwrap();
912 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
913
914 for layer_id in 0..nr_layers {
915 for stat_id in 0..NR_LLC_LSTATS {
916 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
917 }
918 }
919 }
920
921 Self {
922 gstats,
923 lstats,
924 lstats_sums,
925 llc_lstats,
926 }
927 }
928}
929
930impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
931 type Output = BpfStats;
932
933 fn sub(self, rhs: &'b BpfStats) -> BpfStats {
934 let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
935 BpfStats {
936 gstats: vec_sub(&self.gstats, &rhs.gstats),
937 lstats: self
938 .lstats
939 .iter()
940 .zip(rhs.lstats.iter())
941 .map(|(l, r)| vec_sub(l, r))
942 .collect(),
943 lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
944 llc_lstats: self
945 .llc_lstats
946 .iter()
947 .zip(rhs.llc_lstats.iter())
948 .map(|(l_layer, r_layer)| {
949 l_layer
950 .iter()
951 .zip(r_layer.iter())
952 .map(|(l_llc, r_llc)| {
953 let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
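                        // Keep LLC_LSTAT_LAT as the current value rather than
                        // a delta: zero the previous sample before the
                        // subtraction below.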
954 r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
956 vec_sub(&l_llc, &r_llc)
957 })
958 .collect()
959 })
960 .collect(),
961 }
962 }
963}
964
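// Userspace-side scheduling statistics. Utilization and memory bandwidth are
// kept as decayed running averages, the *_ewma fields are time-based
// exponentially weighted moving averages, and the prev_* fields hold the
// previous snapshot used to compute the next delta.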
965#[derive(Clone, Debug)]
966struct Stats {
967 at: Instant,
968 elapsed: Duration,
969 nr_layers: usize,
970 nr_layer_tasks: Vec<usize>,
971 nr_nodes: usize,
972
    total_util: f64,
    layer_utils: Vec<Vec<f64>>,
975 prev_layer_usages: Vec<Vec<u64>>,
976
    layer_membws: Vec<Vec<f64>>,
    prev_layer_membw_agg: Vec<Vec<u64>>,

    cpu_busy: f64,
    prev_total_cpu: fb_procfs::CpuStat,
    prev_pmu_resctrl_membw: (u64, u64),

    system_cpu_util_ewma: f64,
    layer_dsq_insert_ewma: Vec<f64>,

    bpf_stats: BpfStats,
988 prev_bpf_stats: BpfStats,
989
990 processing_dur: Duration,
991 prev_processing_dur: Duration,
992
993 layer_slice_us: Vec<u64>,
994
995 gpu_tasks_affinitized: u64,
996 gpu_task_affinitization_ms: u64,
997}
998
999impl Stats {
1000 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1001 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1002
1003 for cpu in 0..*NR_CPUS_POSSIBLE {
1004 for layer in 0..nr_layers {
1005 for usage in 0..NR_LAYER_USAGES {
1006 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
1007 }
1008 }
1009 }
1010
1011 layer_usages
1012 }
1013
1014 fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1016 let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1017
1018 for cpu in 0..*NR_CPUS_POSSIBLE {
1019 for layer in 0..nr_layers {
1020 for usage in 0..NR_LAYER_USAGES {
1021 layer_membw_agg[layer][usage] += cpu_ctxs[cpu].layer_membw_agg[layer][usage];
1022 }
1023 }
1024 }
1025
1026 layer_membw_agg
1027 }
1028
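    // Sum mbm_total_bytes across all monitoring groups under
    // /sys/fs/resctrl/mon_data to get resctrl's total memory bandwidth byte
    // count.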
1029 fn resctrl_read_total_membw() -> Result<u64> {
1041 let mut total_membw = 0u64;
1042 for entry in WalkDir::new("/sys/fs/resctrl/mon_data")
1043 .min_depth(1)
1044 .into_iter()
1045 .filter_map(Result::ok)
1046 .filter(|x| x.path().is_dir())
1047 {
1048 let mut path = entry.path().to_path_buf();
1049 path.push("mbm_total_bytes");
1050 total_membw += fs::read_to_string(path)?.trim().parse::<u64>()?;
1051 }
1052
1053 Ok(total_membw)
1054 }
1055
1056 fn new(
1057 skel: &mut BpfSkel,
1058 proc_reader: &fb_procfs::ProcReader,
1059 gpu_task_affinitizer: &GpuTaskAffinitizer,
1060 ) -> Result<Self> {
1061 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
1062 let cpu_ctxs = read_cpu_ctxs(skel)?;
1063 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1064 let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
1065 let pmu_membw = Self::read_layer_membw_agg(&cpu_ctxs, nr_layers);
1066
1067 Ok(Self {
1068 at: Instant::now(),
1069 elapsed: Default::default(),
1070 nr_layers,
1071 nr_layer_tasks: vec![0; nr_layers],
1072 nr_nodes,
1073
1074 total_util: 0.0,
1075 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1076 layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1077 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1078 prev_layer_membw_agg: pmu_membw,
1082 prev_pmu_resctrl_membw: (0, 0),
1083
1084 cpu_busy: 0.0,
1085 prev_total_cpu: read_total_cpu(proc_reader)?,
1086 system_cpu_util_ewma: 0.0,
1087 layer_dsq_insert_ewma: vec![0.0; nr_layers],
1088
1089 bpf_stats: bpf_stats.clone(),
1090 prev_bpf_stats: bpf_stats,
1091
1092 processing_dur: Default::default(),
1093 prev_processing_dur: Default::default(),
1094
1095 layer_slice_us: vec![0; nr_layers],
1096 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1097 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1098 })
1099 }
1100
1101 fn refresh(
1102 &mut self,
1103 skel: &mut BpfSkel,
1104 proc_reader: &fb_procfs::ProcReader,
1105 now: Instant,
1106 cur_processing_dur: Duration,
1107 gpu_task_affinitizer: &GpuTaskAffinitizer,
1108 ) -> Result<()> {
1109 let elapsed = now.duration_since(self.at);
1110 let elapsed_f64 = elapsed.as_secs_f64();
1111 let cpu_ctxs = read_cpu_ctxs(skel)?;
1112
1113 let nr_layer_tasks: Vec<usize> = skel
1114 .maps
1115 .bss_data
1116 .as_ref()
1117 .unwrap()
1118 .layers
1119 .iter()
1120 .take(self.nr_layers)
1121 .map(|layer| layer.nr_tasks as usize)
1122 .collect();
1123 let layer_slice_us: Vec<u64> = skel
1124 .maps
1125 .bss_data
1126 .as_ref()
1127 .unwrap()
1128 .layers
1129 .iter()
1130 .take(self.nr_layers)
1131 .map(|layer| layer.slice_ns / 1000_u64)
1132 .collect();
1133
1134 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1135 let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1136
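        // Calibrate PMU-derived per-layer memory bandwidth against resctrl:
        // the per-layer deltas are scaled by
        // factor = Δresctrl_bytes / Δpmu_count so that their sum matches the
        // total measured under /sys/fs/resctrl for the same interval.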
1137 let (pmu_prev, resctrl_prev) = self.prev_pmu_resctrl_membw;
1145 let pmu_cur: u64 = cur_layer_membw_agg
1146 .iter()
1147 .map(|membw_agg| membw_agg[LAYER_USAGE_OPEN] + membw_agg[LAYER_USAGE_OWNED])
1148 .sum();
1149 let resctrl_cur = Self::resctrl_read_total_membw()?;
1150 let factor = (resctrl_cur - resctrl_prev) as f64 / (pmu_cur - pmu_prev) as f64;
1151
1152 let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1154 cur_agg
1155 .iter()
1156 .zip(prev_agg.iter())
1157 .map(|(cur, prev)| {
1158 cur.iter()
1159 .zip(prev.iter())
1160 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1161 .collect()
1162 })
1163 .collect()
1164 };
1165
1166 let compute_mem_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1169 cur_agg
1170 .iter()
1171 .zip(prev_agg.iter())
1172 .map(|(cur, prev)| {
1173 cur.iter()
1174 .zip(prev.iter())
1175 .map(|(c, p)| (*c as i64 - *p as i64) as f64 / (1024 as f64).powf(3.0))
1176 .collect()
1177 })
1178 .collect()
1179 };
1180
1181 let cur_layer_utils: Vec<Vec<f64>> =
1182 compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1183
1184 let cur_layer_membw: Vec<Vec<f64>> =
1186 compute_mem_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1187
1188 let cur_layer_membw: Vec<Vec<f64>> = cur_layer_membw
1189 .iter()
1190 .map(|x| x.iter().map(|x| *x * factor).collect())
1191 .collect();
1192
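        // Half-life decay blend of the new sample into the running average:
        // decay = decay_rate^elapsed, new = prev * decay + cur * (1 - decay).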
1193 let metric_decay =
1194 |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>, decay_rate: f64| {
1195 cur_metric
1196 .iter()
1197 .zip(prev_metric.iter())
1198 .map(|(cur, prev)| {
1199 cur.iter()
1200 .zip(prev.iter())
1201 .map(|(c, p)| {
1202 let decay = decay_rate.powf(elapsed_f64);
1203 p * decay + c * (1.0 - decay)
1204 })
1205 .collect()
1206 })
1207 .collect()
1208 };
1209
1210 let layer_utils: Vec<Vec<f64>> =
1211 metric_decay(cur_layer_utils, &self.layer_utils, *USAGE_DECAY);
1212 let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws, 0.0);
1213
1214 let cur_total_cpu = read_total_cpu(proc_reader)?;
1215 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1216
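        // Blend the sampled busy fraction into a ~10s exponentially weighted
        // moving average: alpha = elapsed / max(SYS_CPU_UTIL_EWMA_SECS, elapsed).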
1217 const SYS_CPU_UTIL_EWMA_SECS: f64 = 10.0;
1219 let elapsed_f64 = elapsed.as_secs_f64();
1220 let alpha = elapsed_f64 / SYS_CPU_UTIL_EWMA_SECS.max(elapsed_f64);
1221 let system_cpu_util_ewma = alpha * cpu_busy + (1.0 - alpha) * self.system_cpu_util_ewma;
1222
1223 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1224 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1225
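        // Per-layer EWMA (~10s) of the fraction of dispatches that went
        // through a DSQ (ENQ_DSQ) rather than directly to a local CPU
        // (SEL_LOCAL + ENQ_LOCAL); likely feeds the DsqInsertBelow match.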
1226 const DSQ_INSERT_EWMA_SECS: f64 = 10.0;
1228 let dsq_alpha = elapsed_f64 / DSQ_INSERT_EWMA_SECS.max(elapsed_f64);
1229 let layer_dsq_insert_ewma: Vec<f64> = (0..self.nr_layers)
1230 .map(|layer_id| {
1231 let sel_local = bpf_stats.lstats[layer_id]
1232 [bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize]
1233 as f64;
1234 let enq_local = bpf_stats.lstats[layer_id]
1235 [bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize]
1236 as f64;
1237 let enq_dsq = bpf_stats.lstats[layer_id]
1238 [bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize]
1239 as f64;
1240 let total_dispatches = sel_local + enq_local + enq_dsq;
1241
1242 let cur_ratio = if total_dispatches > 0.0 {
1243 enq_dsq / total_dispatches
1244 } else {
1245 0.0
1246 };
1247
1248 dsq_alpha * cur_ratio + (1.0 - dsq_alpha) * self.layer_dsq_insert_ewma[layer_id]
1249 })
1250 .collect();
1251
1252 let processing_dur = cur_processing_dur
1253 .checked_sub(self.prev_processing_dur)
1254 .unwrap();
1255
1256 *self = Self {
1257 at: now,
1258 elapsed,
1259 nr_layers: self.nr_layers,
1260 nr_layer_tasks,
1261 nr_nodes: self.nr_nodes,
1262
1263 total_util: layer_utils
1264 .iter()
1265 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1266 .sum(),
1267 layer_utils,
1268 layer_membws,
1269 prev_layer_usages: cur_layer_usages,
1270 prev_layer_membw_agg: cur_layer_membw_agg,
1271 prev_pmu_resctrl_membw: (pmu_cur, resctrl_cur),
1273
1274 cpu_busy,
1275 prev_total_cpu: cur_total_cpu,
1276 system_cpu_util_ewma,
1277 layer_dsq_insert_ewma,
1278
1279 bpf_stats,
1280 prev_bpf_stats: cur_bpf_stats,
1281
1282 processing_dur,
1283 prev_processing_dur: cur_processing_dur,
1284
1285 layer_slice_us,
1286 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1287 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1288 };
1289 Ok(())
1290 }
1291}
1292
1293#[derive(Debug)]
1294struct Layer {
1295 name: String,
1296 kind: LayerKind,
1297 growth_algo: LayerGrowthAlgo,
1298 core_order: Vec<usize>,
1299
1300 target_llc_cpus: (usize, usize),
1301 assigned_llcs: Vec<usize>,
1302
1303 nr_cpus: usize,
1304 nr_llc_cpus: Vec<usize>,
1305 cpus: Cpumask,
1306 allowed_cpus: Cpumask,
1307}
1308
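// Resolve a kernel symbol's address by scanning /proc/kallsyms. This is a
// substring match on the line, so the symbol name should be unambiguous.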
1309fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1310 fs::read_to_string("/proc/kallsyms")?
1311 .lines()
1312 .find(|line| line.contains(sym_name))
1313 .and_then(|line| line.split_whitespace().next())
1314 .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1315 .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1316}
1317
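// Resolve a layer's allowed CPU count range: `cpus_range` gives absolute
// bounds, `cpus_range_frac` gives bounds as a fraction of `max_cpus`, the two
// are mutually exclusive, and the default is (0, max_cpus).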
1318fn resolve_cpus_pct_range(
1319 cpus_range: &Option<(usize, usize)>,
1320 cpus_range_frac: &Option<(f64, f64)>,
1321 max_cpus: usize,
1322) -> Result<(usize, usize)> {
1323 match (cpus_range, cpus_range_frac) {
1324 (Some(_x), Some(_y)) => {
1325 bail!("cpus_range cannot be used with cpus_pct.");
1326 }
1327 (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1328 (None, Some((cpus_frac_min, cpus_frac_max))) => {
1329 if *cpus_frac_min < 0_f64
1330 || *cpus_frac_min > 1_f64
1331 || *cpus_frac_max < 0_f64
1332 || *cpus_frac_max > 1_f64
1333 {
1334 bail!("cpus_range_frac values must be between 0.0 and 1.0");
1335 }
1336 let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1337 let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1338 Ok((
1339 std::cmp::max(cpus_min_count, 1),
1340 std::cmp::min(cpus_max_count, max_cpus),
1341 ))
1342 }
1343 (None, None) => Ok((0, max_cpus)),
1344 }
1345}
1346
1347impl Layer {
1348 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1349 let name = &spec.name;
1350 let kind = spec.kind.clone();
1351 let mut allowed_cpus = Cpumask::new();
1352 match &kind {
1353 LayerKind::Confined {
1354 cpus_range,
1355 cpus_range_frac,
1356 common: LayerCommon { nodes, llcs, .. },
1357 ..
1358 } => {
1359 let cpus_range =
1360 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1361 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1362 bail!("invalid cpus_range {:?}", cpus_range);
1363 }
1364 if nodes.is_empty() && llcs.is_empty() {
1365 allowed_cpus.set_all();
1366 } else {
1367 for (node_id, node) in &topo.nodes {
1369 if nodes.contains(node_id) {
1371 for &id in node.all_cpus.keys() {
1372 allowed_cpus.set_cpu(id)?;
1373 }
1374 }
1375 for (llc_id, llc) in &node.llcs {
1377 if llcs.contains(llc_id) {
1378 for &id in llc.all_cpus.keys() {
1379 allowed_cpus.set_cpu(id)?;
1380 }
1381 }
1382 }
1383 }
1384 }
1385 }
1386 LayerKind::Grouped {
1387 common: LayerCommon { nodes, llcs, .. },
1388 ..
1389 }
1390 | LayerKind::Open {
1391 common: LayerCommon { nodes, llcs, .. },
1392 ..
1393 } => {
1394 if nodes.is_empty() && llcs.is_empty() {
1395 allowed_cpus.set_all();
1396 } else {
1397 for (node_id, node) in &topo.nodes {
1399 if nodes.contains(node_id) {
1401 for &id in node.all_cpus.keys() {
1402 allowed_cpus.set_cpu(id)?;
1403 }
1404 }
1405 for (llc_id, llc) in &node.llcs {
1407 if llcs.contains(llc_id) {
1408 for &id in llc.all_cpus.keys() {
1409 allowed_cpus.set_cpu(id)?;
1410 }
1411 }
1412 }
1413 }
1414 }
1415 }
1416 }
1417
1418 if let Some(util_range) = kind.util_range() {
1421 if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1422 bail!("invalid util_range {:?}", util_range);
1423 }
1424 }
1425
1426 let layer_growth_algo = kind.common().growth_algo.clone();
1427
1428 debug!(
1429 "layer: {} algo: {:?} core order: {:?}",
1430 name, &layer_growth_algo, core_order
1431 );
1432
1433 Ok(Self {
1434 name: name.into(),
1435 kind,
1436 growth_algo: layer_growth_algo,
1437 core_order: core_order.clone(),
1438
1439 target_llc_cpus: (0, 0),
1440 assigned_llcs: vec![],
1441
1442 nr_cpus: 0,
1443 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1444 cpus: Cpumask::new(),
1445 allowed_cpus,
1446 })
1447 }
1448
1449 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1450 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1451 Some(ret) => ret.clone(),
1452 None => return Ok(0),
1453 };
1454
1455 let nr_to_free = cpus_to_free.weight();
1456
1457 Ok(if nr_to_free <= max_to_free {
1458 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1459 self.cpus &= &cpus_to_free.not();
1460 self.nr_cpus -= nr_to_free;
1461 for cpu in cpus_to_free.iter() {
1462 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1463 }
1464 cpu_pool.free(&cpus_to_free)?;
1465 nr_to_free
1466 } else {
1467 0
1468 })
1469 }
1470
1471 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1472 let new_cpus = match cpu_pool
1473 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1474 .clone()
1475 {
1476 Some(ret) => ret.clone(),
1477 None => {
1478 trace!("layer-{} can't grow, no CPUs", &self.name);
1479 return Ok(0);
1480 }
1481 };
1482
1483 let nr_new_cpus = new_cpus.weight();
1484
1485 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1486 self.cpus |= &new_cpus;
1487 self.nr_cpus += nr_new_cpus;
1488 for cpu in new_cpus.iter() {
1489 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1490 }
1491 Ok(nr_new_cpus)
1492 }
1493}
1494#[derive(Debug, Clone)]
1495struct NodeInfo {
1496 node_mask: nix::sched::CpuSet,
1497 _node_id: usize,
1498}
1499
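// Optionally pins GPU-using processes to the NUMA node closest to the GPU
// they use. The device-to-node mapping is derived from NVML CPU affinity
// masks; each GPU process, its children, and its threads are periodically
// re-affinitized to that node's cpuset via sched_setaffinity().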
1500#[derive(Debug)]
1501struct GpuTaskAffinitizer {
1502 gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1505 gpu_pids_to_devs: HashMap<Pid, u32>,
1506 last_process_time: Option<Instant>,
1507 sys: System,
1508 pid_map: HashMap<Pid, Vec<Pid>>,
1509 poll_interval: Duration,
1510 enable: bool,
1511 tasks_affinitized: u64,
1512 last_task_affinitization_ms: u64,
1513}
1514
1515impl GpuTaskAffinitizer {
1516 pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1517 GpuTaskAffinitizer {
1518 gpu_devs_to_node_info: HashMap::new(),
1519 gpu_pids_to_devs: HashMap::new(),
1520 last_process_time: None,
1521 sys: System::default(),
1522 pid_map: HashMap::new(),
1523 poll_interval: Duration::from_secs(poll_interval),
1524 enable,
1525 tasks_affinitized: 0,
1526 last_task_affinitization_ms: 0,
1527 }
1528 }
1529
1530 fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1531 for (chunk, &mask) in affinity.iter().enumerate() {
1532 let mut inner_offset: u64 = 1;
1533 for _ in 0..64 {
1534 if (mask & inner_offset) != 0 {
1535 return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1536 }
1537 inner_offset = inner_offset << 1;
1538 }
1539 }
1540 anyhow::bail!("unable to get CPU from NVML bitmask");
1541 }
1542
1543 fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1544 let mut cpuset = CpuSet::new();
1545 for (cpu_id, _cpu) in &node.all_cpus {
1546 cpuset.set(*cpu_id)?;
1547 }
1548 Ok(cpuset)
1549 }
1550
1551 fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1552 let nvml = nvml()?;
1553 let device_count = nvml.device_count()?;
1554
1555 for idx in 0..device_count {
1556 let dev = nvml.device_by_index(idx)?;
1557 let cpu = dev.cpu_affinity(16)?;
1559 let ideal_cpu = self.find_one_cpu(cpu)?;
1560 if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1561 self.gpu_devs_to_node_info.insert(
1562 idx,
1563 NodeInfo {
1564 node_mask: self.node_to_cpuset(
1565 topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1566 )?,
1567 _node_id: cpu.node_id,
1568 },
1569 );
1570 }
1571 }
1572 Ok(())
1573 }
1574
1575 fn update_gpu_pids(&mut self) -> Result<()> {
1576 let nvml = nvml()?;
1577 for i in 0..nvml.device_count()? {
1578 let device = nvml.device_by_index(i)?;
1579 for proc in device
1580 .running_compute_processes()?
1581 .into_iter()
1582 .chain(device.running_graphics_processes()?.into_iter())
1583 {
1584 self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1585 }
1586 }
1587 Ok(())
1588 }
1589
1590 fn update_process_info(&mut self) -> Result<()> {
1591 self.sys.refresh_processes_specifics(
1592 ProcessesToUpdate::All,
1593 true,
1594 ProcessRefreshKind::nothing(),
1595 );
1596 self.pid_map.clear();
1597 for (pid, proc_) in self.sys.processes() {
1598 if let Some(ppid) = proc_.parent() {
1599 self.pid_map.entry(ppid).or_default().push(*pid);
1600 }
1601 }
1602 Ok(())
1603 }
1604
1605 fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1606 let mut work = VecDeque::from([root_pid]);
1607 let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1608
1609 while let Some(pid) = work.pop_front() {
1610 if pids_and_tids.insert(pid) {
1611 if let Some(kids) = self.pid_map.get(&pid) {
1612 work.extend(kids);
1613 }
1614 if let Some(proc_) = self.sys.process(pid) {
1615 if let Some(tasks) = proc_.tasks() {
1616 pids_and_tids.extend(tasks.iter().copied());
1617 }
1618 }
1619 }
1620 }
1621 pids_and_tids
1622 }
1623
1624 fn affinitize_gpu_pids(&mut self) -> Result<()> {
1625 if !self.enable {
1626 return Ok(());
1627 }
1628 for (pid, dev) in &self.gpu_pids_to_devs {
1629 let node_info = self
1630 .gpu_devs_to_node_info
1631 .get(&dev)
1632 .expect("Unable to get gpu pid node mask");
1633 for child in self.get_child_pids_and_tids(*pid) {
1634 match nix::sched::sched_setaffinity(
1635 nix::unistd::Pid::from_raw(child.as_u32() as i32),
1636 &node_info.node_mask,
1637 ) {
1638 Ok(_) => {
1639 self.tasks_affinitized += 1;
1641 }
1642 Err(_) => {
1643 debug!(
1644 "Error affinitizing gpu pid {} to node {:#?}",
1645 child.as_u32(),
1646 node_info
1647 );
1648 }
1649 };
1650 }
1651 }
1652 Ok(())
1653 }
1654
1655 pub fn maybe_affinitize(&mut self) {
1656 if !self.enable {
1657 return;
1658 }
1659 let now = Instant::now();
1660
1661 if let Some(last_process_time) = self.last_process_time {
1662 if (now - last_process_time) < self.poll_interval {
1663 return;
1664 }
1665 }
1666
1667 match self.update_gpu_pids() {
1668 Ok(_) => {}
1669 Err(e) => {
1670 error!("Error updating GPU PIDs: {}", e);
1671 }
1672 };
1673 match self.update_process_info() {
1674 Ok(_) => {}
1675 Err(e) => {
1676 error!("Error updating process info to affinitize GPU PIDs: {}", e);
1677 }
1678 };
1679 match self.affinitize_gpu_pids() {
1680 Ok(_) => {}
1681 Err(e) => {
1682 error!("Error updating GPU PIDs: {}", e);
1683 }
1684 };
1685 self.last_process_time = Some(now);
1686 self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1687
1688 return;
1689 }
1690
1691 pub fn init(&mut self, topo: Arc<Topology>) {
1692 if !self.enable || self.last_process_time.is_some() {
1693 return;
1694 }
1695
1696 match self.init_dev_node_map(topo) {
1697 Ok(_) => {}
1698 Err(e) => {
1699 error!("Error initializing gpu node dev map: {}", e);
1700 }
1701 };
1702 self.sys = System::new_all();
1703 return;
1704 }
1705}
1706
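// Top-level scheduler state: the loaded BPF skeleton and struct_ops link,
// resolved layer specs with their runtime Layer state, the CPU pool used to
// grow and shrink layers, statistics, topology and netdev info, and the
// stats server.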
1707struct Scheduler<'a> {
1708 skel: BpfSkel<'a>,
1709 struct_ops: Option<libbpf_rs::Link>,
1710 layer_specs: Vec<LayerSpec>,
1711
1712 sched_intv: Duration,
1713 layer_refresh_intv: Duration,
1714
1715 cpu_pool: CpuPool,
1716 layers: Vec<Layer>,
1717 idle_qos_enabled: bool,
1718
1719 proc_reader: fb_procfs::ProcReader,
1720 sched_stats: Stats,
1721
1722 cgroup_regexes: Option<HashMap<u32, Regex>>,
1723
1724 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1725 processing_dur: Duration,
1726
1727 topo: Arc<Topology>,
1728 netdevs: BTreeMap<String, NetDev>,
1729 stats_server: StatsServer<StatsReq, StatsRes>,
1730 gpu_task_handler: GpuTaskAffinitizer,
1731}
1732
1733impl<'a> Scheduler<'a> {
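    // Translate each LayerSpec's match rules and common parameters into the
    // BPF-side layer structs, compile any CgroupRegex matchers (returned for
    // evaluation in userspace), and record the weight-sorted layer iteration
    // order.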
1734 fn init_layers(
1735 skel: &mut OpenBpfSkel,
1736 specs: &[LayerSpec],
1737 topo: &Topology,
1738 ) -> Result<HashMap<u32, Regex>> {
1739 skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1740 let mut perf_set = false;
1741
1742 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1743 let mut layer_weights: Vec<usize> = vec![];
1744 let mut cgroup_regex_id = 0;
1745 let mut cgroup_regexes = HashMap::new();
1746
1747 for (spec_i, spec) in specs.iter().enumerate() {
1748 let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1749
1750 for (or_i, or) in spec.matches.iter().enumerate() {
1751 for (and_i, and) in or.iter().enumerate() {
1752 let mt = &mut layer.matches[or_i].matches[and_i];
1753
1754 mt.exclude.write(false);
1756
1757 match and {
1758 LayerMatch::CgroupPrefix(prefix) => {
1759 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1760 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1761 }
1762 LayerMatch::CgroupSuffix(suffix) => {
1763 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1764 copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1765 }
1766 LayerMatch::CgroupRegex(regex_str) => {
1767 if cgroup_regex_id >= bpf_intf::consts_MAX_CGROUP_REGEXES {
1768 bail!(
1769 "Too many cgroup regex rules. Maximum allowed: {}",
1770 bpf_intf::consts_MAX_CGROUP_REGEXES
1771 );
1772 }
1773
1774 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_REGEX as i32;
1776 mt.cgroup_regex_id = cgroup_regex_id;
1777
1778 let regex = Regex::new(regex_str).with_context(|| {
1779 format!("Invalid regex '{}' in layer '{}'", regex_str, spec.name)
1780 })?;
1781 cgroup_regexes.insert(cgroup_regex_id, regex);
1782 cgroup_regex_id += 1;
1783 }
1784 LayerMatch::CgroupContains(substr) => {
1785 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1786 copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1787 }
1788 LayerMatch::CommPrefix(prefix) => {
1789 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1790 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1791 }
1792 LayerMatch::CommPrefixExclude(prefix) => {
1793 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1794 mt.exclude.write(true);
1795 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1796 }
1797 LayerMatch::PcommPrefix(prefix) => {
1798 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1799 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1800 }
1801 LayerMatch::PcommPrefixExclude(prefix) => {
1802 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1803 mt.exclude.write(true);
1804 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1805 }
1806 LayerMatch::NiceAbove(nice) => {
1807 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1808 mt.nice = *nice;
1809 }
1810 LayerMatch::NiceBelow(nice) => {
1811 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1812 mt.nice = *nice;
1813 }
1814 LayerMatch::NiceEquals(nice) => {
1815 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1816 mt.nice = *nice;
1817 }
1818 LayerMatch::UIDEquals(user_id) => {
1819 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1820 mt.user_id = *user_id;
1821 }
1822 LayerMatch::GIDEquals(group_id) => {
1823 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1824 mt.group_id = *group_id;
1825 }
1826 LayerMatch::PIDEquals(pid) => {
1827 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1828 mt.pid = *pid;
1829 }
1830 LayerMatch::PPIDEquals(ppid) => {
1831 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1832 mt.ppid = *ppid;
1833 }
1834 LayerMatch::TGIDEquals(tgid) => {
1835 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1836 mt.tgid = *tgid;
1837 }
1838 LayerMatch::NSPIDEquals(nsid, pid) => {
1839 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1840 mt.nsid = *nsid;
1841 mt.pid = *pid;
1842 }
1843 LayerMatch::NSEquals(nsid) => {
1844 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1845 mt.nsid = *nsid as u64;
1846 }
1847 LayerMatch::CmdJoin(joincmd) => {
1848 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1849 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1850 }
1851 LayerMatch::IsGroupLeader(polarity) => {
1852 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1853 mt.is_group_leader.write(*polarity);
1854 }
1855 LayerMatch::IsKthread(polarity) => {
1856 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1857 mt.is_kthread.write(*polarity);
1858 }
1859 LayerMatch::UsedGpuTid(polarity) => {
1860 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1861 mt.used_gpu_tid.write(*polarity);
1862 }
1863 LayerMatch::UsedGpuPid(polarity) => {
1864 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1865 mt.used_gpu_pid.write(*polarity);
1866 }
1867 LayerMatch::AvgRuntime(min, max) => {
1868 mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1869 mt.min_avg_runtime_us = *min;
1870 mt.max_avg_runtime_us = *max;
1871 }
1872 LayerMatch::HintEquals(hint) => {
1873 mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1874 mt.hint = *hint;
1875 }
1876 LayerMatch::SystemCpuUtilBelow(threshold) => {
1877 mt.kind = bpf_intf::layer_match_kind_MATCH_SYSTEM_CPU_UTIL_BELOW as i32;
1878 mt.system_cpu_util_below = (*threshold * 10000.0) as u64;
1879 }
1880 LayerMatch::DsqInsertBelow(threshold) => {
1881 mt.kind = bpf_intf::layer_match_kind_MATCH_DSQ_INSERT_BELOW as i32;
1882 mt.dsq_insert_below = (*threshold * 10000.0) as u64;
1883 }
1884 }
1885 }
1886 layer.matches[or_i].nr_match_ands = or.len() as i32;
1887 }
1888
1889 layer.nr_match_ors = spec.matches.len() as u32;
1890 layer.kind = spec.kind.as_bpf_enum();
1891
1892 {
1893 let LayerCommon {
1894 min_exec_us,
1895 yield_ignore,
1896 perf,
1897 preempt,
1898 preempt_first,
1899 exclusive,
1900 allow_node_aligned,
1901 skip_remote_node,
1902 prev_over_idle_core,
1903 growth_algo,
1904 nodes,
1905 slice_us,
1906 fifo,
1907 weight,
1908 disallow_open_after_us,
1909 disallow_preempt_after_us,
1910 xllc_mig_min_us,
1911 placement,
1912 member_expire_ms,
1913 ..
1914 } = spec.kind.common();
1915
1916 layer.slice_ns = *slice_us * 1000;
1917 layer.fifo.write(*fifo);
1918 layer.min_exec_ns = min_exec_us * 1000;
1919 layer.yield_step_ns = if *yield_ignore > 0.999 {
1920 0
1921 } else if *yield_ignore < 0.001 {
1922 layer.slice_ns
1923 } else {
1924 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1925 };
1926 let mut layer_name: String = spec.name.clone();
1927 layer_name.truncate(MAX_LAYER_NAME);
1928 copy_into_cstr(&mut layer.name, layer_name.as_str());
1929 layer.preempt.write(*preempt);
1930 layer.preempt_first.write(*preempt_first);
1931 layer.excl.write(*exclusive);
1932 layer.allow_node_aligned.write(*allow_node_aligned);
1933 layer.skip_remote_node.write(*skip_remote_node);
1934 layer.prev_over_idle_core.write(*prev_over_idle_core);
1935 layer.growth_algo = growth_algo.as_bpf_enum();
1936 layer.weight = *weight;
1937 layer.member_expire_ms = *member_expire_ms;
1938 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1939 v if v == u64::MAX => v,
1940 v => v * 1000,
1941 };
1942 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1943 v if v == u64::MAX => v,
1944 v => v * 1000,
1945 };
1946 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1947 layer_weights.push(layer.weight.try_into().unwrap());
1948 layer.perf = u32::try_from(*perf)?;
1949 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1950 for (topo_node_id, topo_node) in &topo.nodes {
1951 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1952 continue;
1953 }
1954 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1955 }
1956
1957 let task_place = |place: u32| crate::types::layer_task_place(place);
1958 layer.task_place = match placement {
1959 LayerPlacement::Standard => {
1960 task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1961 }
1962 LayerPlacement::Sticky => {
1963 task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1964 }
1965 LayerPlacement::Floating => {
1966 task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1967 }
1968 };
1969 }
1970
1971 layer.is_protected.write(match spec.kind {
1972 LayerKind::Open { .. } => false,
1973 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1974 protected
1975 }
1976 });
1977
1978 match &spec.cpuset {
1979 Some(mask) => {
1980 Self::update_cpumask(&mask, &mut layer.cpuset);
1981 }
1982 None => {
1983 for i in 0..layer.cpuset.len() {
1984 layer.cpuset[i] = u8::MAX;
1985 }
1986 }
1987 };
1988
1989 perf_set |= layer.perf > 0;
1990 }
1991
1992 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1993 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1994 skel.maps
1995 .rodata_data
1996 .as_mut()
1997 .unwrap()
1998 .layer_iteration_order[idx] = *layer_idx as u32;
1999 }
2000
2001 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
2002 warn!("cpufreq support not available, ignoring perf configurations");
2003 }
2004
2005 Ok(cgroup_regexes)
2006 }
2007
2008 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
2009 skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
2010 skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
2011
2012 for (&node_id, node) in &topo.nodes {
2013 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
2014 skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
2015 let raw_numa_slice = node.span.as_raw_slice();
2016 let node_cpumask_slice =
2017 &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
2018 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
2019 left.clone_from_slice(raw_numa_slice);
2020 debug!(
2021 "node {} mask: {:?}",
2022 node_id,
2023 skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
2024 );
2025
2026 for llc in node.llcs.values() {
2027 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
2028 skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
2029 }
2030 }
2031
2032 for cpu in topo.all_cpus.values() {
2033 skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
2034 }
2035 }
2036
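    // Build each CPU's proximity map: every other CPU ordered from
    // topologically closest to farthest (same core, same LLC, same node,
    // rest of the system), radiating outward by core/CPU id distance.
    // The *_end fields record the tier boundaries in the ordered list.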
2037 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
2038 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
2039 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
2040 vec
2041 };
2042 let radiate_cpu =
2043 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
2044 vec.sort_by_key(|&id| {
2045 (
2046 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
2047 (center_cpu as i32 - id as i32).abs(),
2048 )
2049 });
2050 vec
2051 };
2052
2053 for (&cpu_id, cpu) in &topo.all_cpus {
2054 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
2056 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
2057 let node_span = &topo.nodes[&cpu.node_id].span;
2058 let sys_span = &topo.span;
2059
2060 let sys_span = sys_span.and(&node_span.not());
2062 let node_span = node_span.and(&llc_span.not());
2063 let llc_span = llc_span.and(&core_span.not());
2064 core_span.clear_cpu(cpu_id).unwrap();
2065
2066 let mut sys_order: Vec<usize> = sys_span.iter().collect();
2068 let mut node_order: Vec<usize> = node_span.iter().collect();
2069 let mut llc_order: Vec<usize> = llc_span.iter().collect();
2070 let mut core_order: Vec<usize> = core_span.iter().collect();
2071
2072 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
2077 node_order = radiate(node_order, cpu.node_id);
2078 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
2079 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
2080
2081 let mut order: Vec<usize> = vec![];
2083 let mut idx: usize = 0;
2084
2085 idx += 1;
2086 order.push(cpu_id);
2087
2088 idx += core_order.len();
2089 order.append(&mut core_order);
2090 let core_end = idx;
2091
2092 idx += llc_order.len();
2093 order.append(&mut llc_order);
2094 let llc_end = idx;
2095
2096 idx += node_order.len();
2097 order.append(&mut node_order);
2098 let node_end = idx;
2099
2100 idx += sys_order.len();
2101 order.append(&mut sys_order);
2102 let sys_end = idx;
2103
2104 debug!(
2105 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
2106 cpu_id, core_end, llc_end, node_end, sys_end, &order
2107 );
2108
2109 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
2111 for (i, &cpu) in order.iter().enumerate() {
2112 pmap.cpus[i] = cpu as u16;
2113 }
2114 pmap.core_end = core_end as u32;
2115 pmap.llc_end = llc_end as u32;
2116 pmap.node_end = node_end as u32;
2117 pmap.sys_end = sys_end as u32;
2118 }
2119 }
2120
2121 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
2122 cpu_ctxs
2123 .into_iter()
2124 .map(|cpu_ctx| {
2125 let bytes = unsafe {
2126 std::slice::from_raw_parts(
2127 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
2128 std::mem::size_of::<bpf_intf::cpu_ctx>(),
2129 )
2130 };
2131 bytes.to_vec()
2132 })
2133 .collect()
2134 }
2135
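    // Initialize per-CPU contexts: flag big/turbo cores, give each CPU its
    // own randomly shuffled iteration order over the open/grouped,
    // preempt/non-preempt layer groups so CPUs don't all scan layers in the
    // same order, fill in the proximity maps, and write the contexts back to
    // the percpu map.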
2136 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
2137 let key = (0_u32).to_ne_bytes();
2138 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
2139 let cpu_ctxs_vec = skel
2140 .maps
2141 .cpu_ctxs
2142 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
2143 .context("Failed to lookup cpu_ctx")?
2144 .unwrap();
2145
2146 let op_layers: Vec<u32> = layer_specs
2147 .iter()
2148 .enumerate()
2149 .filter(|(_idx, spec)| match &spec.kind {
2150 LayerKind::Open { .. } => spec.kind.common().preempt,
2151 _ => false,
2152 })
2153 .map(|(idx, _)| idx as u32)
2154 .collect();
2155 let on_layers: Vec<u32> = layer_specs
2156 .iter()
2157 .enumerate()
2158 .filter(|(_idx, spec)| match &spec.kind {
2159 LayerKind::Open { .. } => !spec.kind.common().preempt,
2160 _ => false,
2161 })
2162 .map(|(idx, _)| idx as u32)
2163 .collect();
2164 let gp_layers: Vec<u32> = layer_specs
2165 .iter()
2166 .enumerate()
2167 .filter(|(_idx, spec)| match &spec.kind {
2168 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2169 _ => false,
2170 })
2171 .map(|(idx, _)| idx as u32)
2172 .collect();
2173 let gn_layers: Vec<u32> = layer_specs
2174 .iter()
2175 .enumerate()
2176 .filter(|(_idx, spec)| match &spec.kind {
2177 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2178 _ => false,
2179 })
2180 .map(|(idx, _)| idx as u32)
2181 .collect();
2182
2183 for cpu in 0..*NR_CPUS_POSSIBLE {
2185 cpu_ctxs.push(*unsafe {
2186 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2187 });
2188
2189 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2190 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2191 cpu_ctxs[cpu].cpu = cpu as i32;
2192 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2193 cpu_ctxs[cpu].is_big = is_big;
2194
2195 fastrand::seed(cpu as u64);
2196
2197 let mut ogp_order = op_layers.clone();
2198 ogp_order.append(&mut gp_layers.clone());
2199 fastrand::shuffle(&mut ogp_order);
2200
2201 let mut ogn_order = on_layers.clone();
2202 ogn_order.append(&mut gn_layers.clone());
2203 fastrand::shuffle(&mut ogn_order);
2204
2205 let mut op_order = op_layers.clone();
2206 fastrand::shuffle(&mut op_order);
2207
2208 let mut on_order = on_layers.clone();
2209 fastrand::shuffle(&mut on_order);
2210
2211 let mut gp_order = gp_layers.clone();
2212 fastrand::shuffle(&mut gp_order);
2213
2214 let mut gn_order = gn_layers.clone();
2215 fastrand::shuffle(&mut gn_order);
2216
2217 for i in 0..MAX_LAYERS {
2218 cpu_ctxs[cpu].ogp_layer_order[i] =
2219 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2220 cpu_ctxs[cpu].ogn_layer_order[i] =
2221 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2222
2223 cpu_ctxs[cpu].op_layer_order[i] =
2224 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2225 cpu_ctxs[cpu].on_layer_order[i] =
2226 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2227 cpu_ctxs[cpu].gp_layer_order[i] =
2228 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2229 cpu_ctxs[cpu].gn_layer_order[i] =
2230 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2231 }
2232 }
2233
2234 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2235
2236 skel.maps
2237 .cpu_ctxs
2238 .update_percpu(
2239 &key,
2240 &Self::convert_cpu_ctxs(cpu_ctxs),
2241 libbpf_rs::MapFlags::ANY,
2242 )
2243 .context("Failed to update cpu_ctx")?;
2244
2245 Ok(())
2246 }
2247
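    // Build each LLC's proximity map: same-node LLCs first, then the rest of
    // the system, shuffled within each tier so LLCs probe their peers in
    // different orders.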
2248 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2249 for (&llc_id, llc) in &topo.all_llcs {
2250 let mut node_order: Vec<usize> =
2252 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2253 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2254
2255 sys_order.retain(|id| !node_order.contains(id));
2257 node_order.retain(|&id| id != llc_id);
2258
2259 fastrand::seed(llc_id as u64);
2262 fastrand::shuffle(&mut sys_order);
2263 fastrand::shuffle(&mut node_order);
2264
2265 let mut order: Vec<usize> = vec![];
2267 let mut idx: usize = 0;
2268
2269 idx += 1;
2270 order.push(llc_id);
2271
2272 idx += node_order.len();
2273 order.append(&mut node_order);
2274 let node_end = idx;
2275
2276 idx += sys_order.len();
2277 order.append(&mut sys_order);
2278 let sys_end = idx;
2279
2280 debug!(
2281 "LLC[{}] proximity map[{}/{}]: {:?}",
2282 llc_id, node_end, sys_end, &order
2283 );
2284
2285 let key = llc_id as u32;
2290 let llc_id_slice =
2291 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2292 let v = skel
2293 .maps
2294 .llc_data
2295 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2296 .unwrap()
2297 .unwrap();
2298 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2299
2300 let pmap = &mut llcc.prox_map;
2301 for (i, &llc_id) in order.iter().enumerate() {
2302 pmap.llcs[i] = llc_id as u16;
2303 }
2304 pmap.node_end = node_end as u32;
2305 pmap.sys_end = sys_end as u32;
2306
2307 let v = unsafe {
2308 std::slice::from_raw_parts(
2309 &llcc as *const bpf_intf::llc_ctx as *const u8,
2310 std::mem::size_of::<bpf_intf::llc_ctx>(),
2311 )
2312 };
2313
2314 skel.maps
2315 .llc_data
2316 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2317 }
2318
2319 Ok(())
2320 }
2321
2322 fn init(
2323 opts: &'a Opts,
2324 layer_specs: &[LayerSpec],
2325 open_object: &'a mut MaybeUninit<OpenObject>,
2326 hint_to_layer_map: &HashMap<u64, HintLayerInfo>,
2327 membw_tracking: bool,
2328 ) -> Result<Self> {
2329 let nr_layers = layer_specs.len();
2330 let mut disable_topology = opts.disable_topology.unwrap_or(false);
2331
2332 let topo = Arc::new(if disable_topology {
2333 Topology::with_flattened_llc_node()?
2334 } else if opts.topology.virt_llc.is_some() {
2335 Topology::with_args(&opts.topology)?
2336 } else {
2337 Topology::new()?
2338 });
2339
2340 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2347 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2348 }
2349 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2350 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2351 }
2352 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2353 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2354 }
2355
2356 let netdevs = if opts.netdev_irq_balance {
2357 warn!(
2358 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2359 );
2360 read_netdevs()?
2361 } else {
2362 BTreeMap::new()
2363 };
2364
2365 if !disable_topology {
2366 if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2367 disable_topology = true;
2368 };
2369 info!(
2370 "Topology awareness not specified, selecting {} based on hardware",
2371 if disable_topology {
2372 "disabled"
2373 } else {
2374 "enabled"
2375 }
2376 );
2377 };
2378
2379 let cpu_pool = CpuPool::new(topo.clone())?;
2380
2381 let layer_specs: Vec<_> = if disable_topology {
2384 info!("Disabling topology awareness");
2385 layer_specs
2386 .iter()
2387 .cloned()
2388 .map(|mut s| {
2389 s.kind.common_mut().nodes.clear();
2390 s.kind.common_mut().llcs.clear();
2391 s
2392 })
2393 .collect()
2394 } else {
2395 layer_specs.to_vec()
2396 };
2397
2398 init_libbpf_logging(None);
2400 let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2401 if !kfuncs_in_syscall {
2402             warn!("Using slow path: kfuncs not supported in syscall programs (kernel lacks commit a8e03b6bbb2c)");
2403 }
2404
2405 let mut skel_builder = BpfSkelBuilder::default();
2407 skel_builder.obj_builder.debug(opts.verbose > 1);
2408 info!(
2409 "Running scx_layered (build ID: {})",
2410 build_id::full_version(env!("CARGO_PKG_VERSION"))
2411 );
2412 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2413 let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2414
2415 skel.progs.scx_pmu_switch_tc.set_autoload(membw_tracking);
2417 skel.progs.scx_pmu_tick_tc.set_autoload(membw_tracking);
2418
2419 if opts.enable_gpu_support {
2422 if opts.gpu_kprobe_level >= 1 {
2425 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2426 }
2427 if opts.gpu_kprobe_level >= 2 {
2430 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2431 }
2432 if opts.gpu_kprobe_level >= 3 {
2433 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2434 }
2435 }
2436
2437 let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2438 let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2439
2440 let event = if membw_tracking {
2441 setup_membw_tracking(&mut skel)?
2442 } else {
2443 0
2444 };
2445
2446 let rodata = skel.maps.rodata_data.as_mut().unwrap();
2447
2448         if let (Ok(ext_addr), Ok(idle_addr)) = (ext_sched_class_addr, idle_sched_class_addr) {
2449             rodata.ext_sched_class_addr = ext_addr;
2450             rodata.idle_sched_class_addr = idle_addr;
2451 } else {
2452 warn!(
2453 "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2454 );
2455 }
2456
2457 rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2458 rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2459
2460 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2462
2463 if !opts.disable_queued_wakeup {
2464 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2465 0 => info!("Kernel does not support queued wakeup optimization"),
2466 v => skel.struct_ops.layered_mut().flags |= v,
2467 }
2468 }
2469
2470 rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2471 rodata.percpu_kthread_preempt_all =
2472 !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2473 rodata.debug = opts.verbose as u32;
2474 rodata.slice_ns = opts.slice_us * 1000;
2475 rodata.max_exec_ns = if opts.max_exec_us > 0 {
2476 opts.max_exec_us * 1000
2477 } else {
2478 opts.slice_us * 1000 * 20
2479 };
2480 rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2481 rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2482 rodata.smt_enabled = topo.smt_enabled;
2483 rodata.has_little_cores = topo.has_little_cores();
2484 rodata.xnuma_preemption = opts.xnuma_preemption;
2485 rodata.antistall_sec = opts.antistall_sec;
2486 rodata.monitor_disable = opts.monitor_disable;
2487 rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2488 rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2489 rodata.enable_antistall = !opts.disable_antistall;
2490 rodata.enable_match_debug = opts.enable_match_debug;
2491 rodata.enable_gpu_support = opts.enable_gpu_support;
2492 rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2493
2494 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2495 rodata.__sibling_cpu[cpu] = *sib;
2496 }
2497 for cpu in topo.all_cpus.keys() {
2498 rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2499 }
2500
2501 rodata.nr_op_layers = layer_specs
2502 .iter()
2503 .filter(|spec| match &spec.kind {
2504 LayerKind::Open { .. } => spec.kind.common().preempt,
2505 _ => false,
2506 })
2507 .count() as u32;
2508 rodata.nr_on_layers = layer_specs
2509 .iter()
2510 .filter(|spec| match &spec.kind {
2511 LayerKind::Open { .. } => !spec.kind.common().preempt,
2512 _ => false,
2513 })
2514 .count() as u32;
2515 rodata.nr_gp_layers = layer_specs
2516 .iter()
2517 .filter(|spec| match &spec.kind {
2518 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2519 _ => false,
2520 })
2521 .count() as u32;
2522 rodata.nr_gn_layers = layer_specs
2523 .iter()
2524 .filter(|spec| match &spec.kind {
2525 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2526 _ => false,
2527 })
2528 .count() as u32;
2529 rodata.nr_excl_layers = layer_specs
2530 .iter()
2531 .filter(|spec| spec.kind.common().exclusive)
2532 .count() as u32;
2533
2534 let mut min_open = u64::MAX;
2535 let mut min_preempt = u64::MAX;
2536
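        // Both disallow_* fields are filled in for every spec by main() before
        // init() is called, so the unwraps below cannot fail.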
2537 for spec in layer_specs.iter() {
2538 if let LayerKind::Open { common, .. } = &spec.kind {
2539 min_open = min_open.min(common.disallow_open_after_us.unwrap());
2540 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2541 }
2542 }
2543
2544 rodata.min_open_layer_disallow_open_after_ns = match min_open {
2545 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2546 v => v,
2547 };
2548 rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2549 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2550 v => v,
2551 };
2552
2553 for i in 0..layer_specs.len() {
2555 skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2556 }
2557 skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2558
2559 let layered_task_hint_map_path = &opts.task_hint_map;
2564 let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2565         if !layered_task_hint_map_path.is_empty() {
2567             hint_map.set_pin_path(layered_task_hint_map_path).context("Failed to set pin path for the task hint map")?;
2568 rodata.task_hint_map_enabled = true;
2569 }
2570
2571 if !opts.hi_fb_thread_name.is_empty() {
2572 let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2573 copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2574 rodata.enable_hi_fb_thread_name_match = true;
2575 }
2576
2577 let cgroup_regexes = Self::init_layers(&mut skel, &layer_specs, &topo)?;
2578 skel.maps.rodata_data.as_mut().unwrap().nr_cgroup_regexes = cgroup_regexes.len() as u32;
2579 Self::init_nodes(&mut skel, opts, &topo);
2580
2581 let mut skel = scx_ops_load!(skel, layered, uei)?;
2582
2583         if !hint_to_layer_map.is_empty() {
2585 for (k, v) in hint_to_layer_map.iter() {
2586 let key: u32 = *k as u32;
2587
2588 let mut info_bytes = vec![0u8; std::mem::size_of::<bpf_intf::hint_layer_info>()];
2590 let info_ptr = info_bytes.as_mut_ptr() as *mut bpf_intf::hint_layer_info;
2591 unsafe {
2592 (*info_ptr).layer_id = v.layer_id as u32;
2593 (*info_ptr).system_cpu_util_below = match v.system_cpu_util_below {
2594 Some(threshold) => (threshold * 10000.0) as u64,
2595                         None => u64::MAX,
2596                     };
2597 (*info_ptr).dsq_insert_below = match v.dsq_insert_below {
2598 Some(threshold) => (threshold * 10000.0) as u64,
2599                         None => u64::MAX,
2600                     };
2601 }
2602
2603 skel.maps.hint_to_layer_id_map.update(
2604 &key.to_ne_bytes(),
2605 &info_bytes,
2606 libbpf_rs::MapFlags::ANY,
2607 )?;
2608 }
2609 }
2610
2611 if membw_tracking {
2612 create_perf_fds(&mut skel, event)?;
2613 }
2614
2615 let mut layers = vec![];
2616 let layer_growth_orders =
2617 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2618 for (idx, spec) in layer_specs.iter().enumerate() {
2619 let growth_order = layer_growth_orders
2620 .get(&idx)
2621 .with_context(|| "layer has no growth order".to_string())?;
2622 layers.push(Layer::new(spec, &topo, growth_order)?);
2623 }
2624
2625 let mut idle_qos_enabled = layers
2626 .iter()
2627 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2628 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2629 warn!("idle_resume_us not supported, ignoring");
2630 idle_qos_enabled = false;
2631 }
2632
2633 Self::init_cpus(&skel, &layer_specs, &topo)?;
2634 Self::init_llc_prox_map(&mut skel, &topo)?;
2635
2636 let proc_reader = fb_procfs::ProcReader::new();
2638
2639 let input = ProgramInput {
2641 ..Default::default()
2642 };
2643 let prog = &mut skel.progs.initialize_pid_namespace;
2644
2645 let _ = prog.test_run(input);
2646
2647         if !layered_task_hint_map_path.is_empty() {
2656 let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2657 let mode: libc::mode_t = 0o666;
2658 unsafe {
2659 if libc::chmod(path.as_ptr(), mode) != 0 {
2660 trace!("'chmod' to 666 of task hint map failed, continuing...");
2661 }
2662 }
2663 }
2664
2665 let struct_ops = scx_ops_attach!(skel, layered)?;
2667 let stats_server = StatsServer::new(stats::server_data()).launch()?;
2668 let mut gpu_task_handler =
2669 GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2670 gpu_task_handler.init(topo.clone());
2671
2672 let sched = Self {
2673 struct_ops: Some(struct_ops),
2674 layer_specs,
2675
2676 sched_intv: Duration::from_secs_f64(opts.interval),
2677 layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2678
2679 cpu_pool,
2680 layers,
2681 idle_qos_enabled,
2682
2683 sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2684
2685 cgroup_regexes: Some(cgroup_regexes),
2686 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2687 processing_dur: Default::default(),
2688
2689 proc_reader,
2690 skel,
2691
2692 topo,
2693 netdevs,
2694 stats_server,
2695 gpu_task_handler,
2696 };
2697
2698 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2699
2700 Ok(sched)
2701 }
2702
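    // Mirror a Cpumask into the byte-array layout used on the BPF side:
    // bit (cpu % 8) of byte (cpu / 8), clearing bits not in the mask.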
2703 fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2704 for cpu in 0..mask.len() {
2705 if mask.test_cpu(cpu) {
2706 bpfmask[cpu / 8] |= 1 << (cpu % 8);
2707 } else {
2708 bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2709 }
2710 }
2711 }
2712
2713 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2714 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2715 Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2716
2717 bpf_layer.nr_cpus = layer.nr_cpus as u32;
2718 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2719 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2720 }
2721
2722 bpf_layer.refresh_cpus = 1;
2723 }
2724
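    // Restrict each network device's IRQ affinity to the currently
    // unallocated CPUs on the device's NUMA node, falling back to the whole
    // node when no such CPU is available.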
2725 fn update_netdev_cpumasks(&mut self) -> Result<()> {
2726 let available_cpus = self.cpu_pool.available_cpus();
2727 if available_cpus.is_empty() {
2728 return Ok(());
2729 }
2730
2731 for (iface, netdev) in self.netdevs.iter_mut() {
2732 let node = self
2733 .topo
2734 .nodes
2735 .values()
2736                 .find(|n| n.id == netdev.node())
2738 .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2739 let node_cpus = node.span.clone();
2740 for (irq, irqmask) in netdev.irqs.iter_mut() {
2741 irqmask.clear_all();
2742 for cpu in available_cpus.iter() {
2743 if !node_cpus.test_cpu(cpu) {
2744 continue;
2745 }
2746 let _ = irqmask.set_cpu(cpu);
2747 }
2748 if irqmask.weight() == 0 {
2750 for cpu in node_cpus.iter() {
2751 let _ = irqmask.set_cpu(cpu);
2752 }
2753 }
2754 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2755 }
2756 netdev.apply_cpumasks()?;
2757 }
2758
2759 Ok(())
2760 }
2761
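    // Estimate how many CPUs the layer can use while staying under its memory
    // bandwidth limit, based on the per-CPU bandwidth observed during the last
    // interval. Returns the current target unchanged when there is no limit or
    // no bandwidth data yet.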
2762 fn clamp_target_by_membw(
2763 &self,
2764 layer: &Layer,
2765 membw_limit: f64,
2766 membw: f64,
2767 curtarget: u64,
2768 ) -> usize {
2769 let ncpu: u64 = layer.cpus.weight() as u64;
2770         let membw = (membw * 1024f64.powi(3)).round() as u64;
2771         let membw_limit = (membw_limit * 1024f64.powi(3)).round() as u64;
2772 let last_membw_percpu = if ncpu > 0 { membw / ncpu } else { 0 };
2773
2774 if membw_limit == 0 || last_membw_percpu == 0 {
2777 return curtarget as usize;
2778 }
2779
2780 return (membw_limit / last_membw_percpu) as usize;
2781 }
2782
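    // Compute the desired number of CPUs for every layer. For Confined and
    // Grouped layers the target is derived from recent utilization and
    // util_range, clamped into cpus_range and, if configured, further capped
    // by the memory bandwidth limit. Open layers report (0, 0) and are handled
    // separately. Returns (target, min) pairs.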
2783 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2789 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2790 let utils = &self.sched_stats.layer_utils;
2791 let membws = &self.sched_stats.layer_membws;
2792
2793 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2794 let mut targets: Vec<(usize, usize)> = vec![];
2795
2796 for (idx, layer) in self.layers.iter().enumerate() {
2797 targets.push(match &layer.kind {
2798 LayerKind::Confined {
2799 util_range,
2800 cpus_range,
2801 cpus_range_frac,
2802 membw_gb,
2803 ..
2804 }
2805 | LayerKind::Grouped {
2806 util_range,
2807 cpus_range,
2808 cpus_range_frac,
2809 membw_gb,
2810 ..
2811 } => {
2812 let cpus_range =
2813 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2814
2815 let owned = utils[idx][LAYER_USAGE_OWNED];
2820 let open = utils[idx][LAYER_USAGE_OPEN];
2821
2822 let membw_owned = membws[idx][LAYER_USAGE_OWNED];
2823 let membw_open = membws[idx][LAYER_USAGE_OPEN];
2824
2825 let mut util = owned;
2826 let mut membw = membw_owned;
2827 if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2828 util += open;
2829 membw += membw_open;
2830 }
2831
2832 let util = if util < 0.01 { 0.0 } else { util };
2833 let low = (util / util_range.1).ceil() as usize;
2834 let high = ((util / util_range.0).floor() as usize).max(low);
2835
2836 let membw_limit = match membw_gb {
2837 Some(membw_limit) => *membw_limit,
2838 None => 0.0,
2839 };
2840
2841 trace!(
2842                     "layer {0} (membw, membw_limit): ({membw} GiB, {membw_limit} GiB)",
2843 layer.name
2844 );
2845
2846 let target = layer.cpus.weight().clamp(low, high);
2847
2848 records.push((
2849 (owned * 100.0) as u64,
2850 (open * 100.0) as u64,
2851 (util * 100.0) as u64,
2852 low,
2853 high,
2854 target,
2855 ));
2856
2857 let target = target.clamp(cpus_range.0, cpus_range.1);
2858 let membw_target = self.clamp_target_by_membw(
2859 &layer,
2860 membw_limit as f64,
2861 membw as f64,
2862 target as u64,
2863 );
2864
2865 trace!("CPU target pre- and post-membw adjustment: {target} -> {membw_target}");
2866
2867 if membw_target < cpus_range.0 {
2870 warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2871 warn!("membw_target {membw_target} low {}", cpus_range.0);
2872 };
2873
2874 let target = membw_target.clamp(cpus_range.0, target);
2877
2878 (target, cpus_range.0)
2879 }
2880 LayerKind::Open { .. } => (0, 0),
2881 });
2882 }
2883
2884 trace!("(owned, open, util, low, high, target): {:?}", &records);
2885 targets
2886 }
2887
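    // Distribute the machine's CPUs across layers according to their weights:
    // first grant layers whose target is at or below their minimum, then
    // layers whose target fits within their weighted share, and finally split
    // the remaining CPUs proportionally among the rest.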
2888 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2892 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2893 let weights: Vec<usize> = self
2894 .layers
2895 .iter()
2896 .map(|layer| layer.kind.common().weight as usize)
2897 .collect();
2898 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2899 .iter()
2900 .zip(&weights)
2901 .enumerate()
2902 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2903 .collect();
2904 let mut weight_sum: usize = weights.iter().sum();
2905 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2906
2907 trace!("cands: {:?}", &cands);
2908
2909 cands.retain(|&i, &mut (target, min, weight)| {
2911 if target <= min {
2912 let target = target.min(nr_left);
2913 weighted[i] = target;
2914 weight_sum -= weight;
2915 nr_left -= target;
2916 false
2917 } else {
2918 true
2919 }
2920 });
2921
2922 trace!("cands after accepting mins: {:?}", &cands);
2923
2924 let calc_share = |nr_left, weight, weight_sum| {
2926 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2927 };
2928
2929 while !cands.is_empty() {
2930 let mut progress = false;
2931
2932 cands.retain(|&i, &mut (target, _min, weight)| {
2933 let share = calc_share(nr_left, weight, weight_sum);
2934 if target <= share {
2935 weighted[i] = target;
2936 weight_sum -= weight;
2937 nr_left -= target;
2938 progress = true;
2939 false
2940 } else {
2941 true
2942 }
2943 });
2944
2945 if !progress {
2946 break;
2947 }
2948 }
2949
2950 trace!("cands after accepting under allotted: {:?}", &cands);
2951
2952 let nr_to_share = nr_left;
2955 for (i, (_target, _min, weight)) in cands.into_iter() {
2956 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2957 weighted[i] = share;
2958 nr_left -= share;
2959 }
2960
2961 trace!("weighted: {:?}", &weighted);
2962
2963 weighted
2964 }
2965
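    // Translate a CPU target into (number of fully used LLCs, extra cores
    // needed on one more LLC), assuming a homogeneous topology.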
2966 fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2970 let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2972 let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2974 let cpus_per_llc = cores_per_llc * cpus_per_core;
2975
2976 let full = target / cpus_per_llc;
2977 let extra = target % cpus_per_llc;
2978
2979 (full, extra.div_ceil(cpus_per_core))
2980 }
2981
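    // For layers using the StickyDynamic growth algorithm: release whole LLCs
    // that are no longer needed, claim free LLCs to meet the new targets,
    // rebuild each layer's core order (borrowing partial LLCs for the extra
    // cores), then free/allocate CPUs so the layer cpumasks match the new core
    // order. Returns true if any layer's CPU assignment changed.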
2982 fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) -> Result<bool> {
2990 debug!(
2992 " free: before pass: free_llcs={:?}",
2993 self.cpu_pool.free_llcs
2994 );
2995 for &(idx, target) in layer_targets.iter().rev() {
2996 let layer = &mut self.layers[idx];
2997 let old_tlc = layer.target_llc_cpus;
2998 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2999
3000 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3001 continue;
3002 }
3003
3004 let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
3005
3006 debug!(
3007 " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
3008 layer.name,
3009 old_tlc,
3010 new_tlc,
3011 to_free,
3012 layer.assigned_llcs.len(),
3013 self.cpu_pool.free_llcs.len()
3014 );
3015
3016             while to_free > 0 && !layer.assigned_llcs.is_empty() {
3017 let llc = layer.assigned_llcs.pop().unwrap();
3018 self.cpu_pool.free_llcs.push((llc, 0));
3019 to_free -= 1;
3020
3021 debug!(" layer={} freed_llc={}", layer.name, llc);
3022 }
3023 }
3024 debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
3025
3026 for &(idx, target) in layer_targets.iter().rev() {
3028 let layer = &mut self.layers[idx];
3029 let old_tlc = layer.target_llc_cpus;
3030 let new_tlc = Self::compute_target_llcs(target, &self.topo);
3031
3032 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3033 continue;
3034 }
3035
3036 let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
3037
3038 debug!(
3039 " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
3040 layer.name,
3041 old_tlc,
3042 new_tlc,
3043 to_alloc,
3044 layer.assigned_llcs.len(),
3045 self.cpu_pool.free_llcs.len()
3046 );
3047
3048             while to_alloc > 0 && to_alloc <= self.cpu_pool.free_llcs.len() {
3052 let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
3053 layer.assigned_llcs.push(llc);
3054 to_alloc -= 1;
3055
3056 debug!(" layer={} alloc_llc={}", layer.name, llc);
3057 }
3058
3059 debug!(
3060 " alloc: layer={} assigned_llcs={:?}",
3061 layer.name, layer.assigned_llcs
3062 );
3063
3064 layer.target_llc_cpus = new_tlc;
3066 }
3067
3068 for &(idx, _) in layer_targets.iter() {
3071 let mut core_order = vec![];
3072 let layer = &mut self.layers[idx];
3073
3074 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3075 continue;
3076 }
3077
3078 let tlc = layer.target_llc_cpus;
3079 let mut extra = tlc.1;
3080 let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
3082 let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
3083 let cpus_per_llc = cores_per_llc * cpus_per_core;
3084
3085 for i in 0..self.cpu_pool.free_llcs.len() {
3087 let free_vec = &mut self.cpu_pool.free_llcs;
3088 let avail = cpus_per_llc - free_vec[i].1;
3090 let mut used = extra.min(avail);
3092 let cores_to_add = used;
3093
3094 let shift = free_vec[i].1;
3095 free_vec[i].1 += used;
3096
3097 let llc_id = free_vec[i].0;
3098 let llc = self.topo.all_llcs.get(&llc_id).unwrap();
3099
3100 for core in llc.cores.iter().skip(shift) {
3101 if used == 0 {
3102 break;
3103 }
3104 core_order.push(core.1.id);
3105 used -= 1;
3106 }
3107
3108 extra -= cores_to_add;
3109 if extra == 0 {
3110 break;
3111 }
3112 }
3113
3114 core_order.reverse();
3115 layer.core_order = core_order;
3116 }
3117
3118 for i in 0..self.cpu_pool.free_llcs.len() {
3120 self.cpu_pool.free_llcs[i].1 = 0;
3121 }
3122
3123 for &(idx, _) in layer_targets.iter() {
3124 let layer = &mut self.layers[idx];
3125
3126 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3127 continue;
3128 }
3129
3130 for core in self.topo.all_cores.iter() {
3131 let llc_id = core.1.llc_id;
3132 if layer.assigned_llcs.contains(&llc_id) {
3133 layer.core_order.push(core.1.id);
3134 }
3135 }
3136 layer.core_order.reverse();
3138
3139 debug!(
3140 " alloc: layer={} core_order={:?}",
3141 layer.name, layer.core_order
3142 );
3143 }
3144
3145 let mut updated = false;
3149
3150 for &(idx, _) in layer_targets.iter() {
3152 let layer = &mut self.layers[idx];
3153
3154 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3155 continue;
3156 }
3157
3158 let mut new_cpus = Cpumask::new();
3160 for &core_id in &layer.core_order {
3161 if let Some(core) = self.topo.all_cores.get(&core_id) {
3162 new_cpus |= &core.span;
3163 }
3164 }
3165
3166 new_cpus &= &layer.allowed_cpus;
3168
3169 let cpus_to_free = layer.cpus.clone().and(&new_cpus.clone().not());
3171
3172 if cpus_to_free.weight() > 0 {
3173 debug!(
3174 " apply: layer={} freeing CPUs: {}",
3175 layer.name, cpus_to_free
3176 );
3177 layer.cpus &= &cpus_to_free.not();
3179 layer.nr_cpus -= cpus_to_free.weight();
3180 for cpu in cpus_to_free.iter() {
3181 layer.nr_llc_cpus[self.cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
3182 }
3183 self.cpu_pool.free(&cpus_to_free)?;
3184 updated = true;
3185 }
3186 }
3187
3188 for &(idx, _) in layer_targets.iter() {
3190 let layer = &mut self.layers[idx];
3191
3192 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3193 continue;
3194 }
3195
3196 let mut new_cpus = Cpumask::new();
3198 for &core_id in &layer.core_order {
3199 if let Some(core) = self.topo.all_cores.get(&core_id) {
3200 new_cpus |= &core.span;
3201 }
3202 }
3203 new_cpus &= &layer.allowed_cpus;
3204
3205 let available_cpus = self.cpu_pool.available_cpus();
3207 let desired_to_alloc = new_cpus.clone().and(&layer.cpus.clone().not());
3208 let cpus_to_alloc = desired_to_alloc.clone().and(&available_cpus);
3209
3210 if desired_to_alloc.weight() > cpus_to_alloc.weight() {
3211 debug!(
3212 " apply: layer={} wanted to alloc {} CPUs but only {} available",
3213 layer.name,
3214 desired_to_alloc.weight(),
3215 cpus_to_alloc.weight()
3216 );
3217 }
3218
3219 if cpus_to_alloc.weight() > 0 {
3220 debug!(
3221 " apply: layer={} allocating CPUs: {}",
3222 layer.name, cpus_to_alloc
3223 );
3224 layer.cpus |= &cpus_to_alloc;
3226 layer.nr_cpus += cpus_to_alloc.weight();
3227 for cpu in cpus_to_alloc.iter() {
3228 layer.nr_llc_cpus[self.cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
3229 }
3230 self.cpu_pool.mark_allocated(&cpus_to_alloc)?;
3231 updated = true;
3232 }
3233
3234 debug!(
3235 " apply: layer={} final cpus.weight()={} nr_cpus={}",
3236 layer.name,
3237 layer.cpus.weight(),
3238 layer.nr_cpus
3239 );
3240 }
3241
3242 Ok(updated)
3243 }
3244
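    // Recompute per-layer CPU targets and resize layers accordingly: shrink
    // over-target layers first, then grow under-target ones, hand whatever is
    // left over to open layers, and push the updated cpumasks, fallback CPU
    // and empty-layer list down to BPF.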
3245 fn refresh_cpumasks(&mut self) -> Result<()> {
3246 let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
3247
3248 let mut updated = false;
3249 let targets = self.calc_target_nr_cpus();
3250 let targets = self.weighted_target_nr_cpus(&targets);
3251
3252 let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
3253 ascending.sort_by(|a, b| a.1.cmp(&b.1));
3254
3255 let sticky_dynamic_updated = self.recompute_layer_core_order(&ascending)?;
3256 updated |= sticky_dynamic_updated;
3257
3258 if sticky_dynamic_updated {
3260 for (idx, layer) in self.layers.iter().enumerate() {
3261 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3262 Self::update_bpf_layer_cpumask(
3263 layer,
3264 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3265 );
3266 }
3267 }
3268 }
3269
3270 let mut force_free = self
3273 .layers
3274 .iter()
3275 .zip(targets.iter())
3276 .any(|(layer, &target)| layer.nr_cpus < target);
3277
3278 for &(idx, target) in ascending.iter().rev() {
3282 let layer = &mut self.layers[idx];
3283 if layer_is_open(layer) {
3284 continue;
3285 }
3286
3287 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3289 continue;
3290 }
3291
3292 let nr_cur = layer.cpus.weight();
3293 if nr_cur <= target {
3294 continue;
3295 }
3296 let mut nr_to_free = nr_cur - target;
3297
3298 let nr_to_break_at = nr_to_free / 2;
3303
3304 let mut freed = false;
3305
3306 while nr_to_free > 0 {
3307 let max_to_free = if force_free {
3308 force_free = false;
3309 layer.nr_cpus
3310 } else {
3311 nr_to_free
3312 };
3313
3314 let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
3315 if nr_freed == 0 {
3316 break;
3317 }
3318
3319 nr_to_free = nr_to_free.saturating_sub(nr_freed);
3320 freed = true;
3321
3322 if nr_to_free <= nr_to_break_at {
3323 break;
3324 }
3325 }
3326
3327 if freed {
3328 Self::update_bpf_layer_cpumask(
3329 layer,
3330 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3331 );
3332 updated = true;
3333 }
3334 }
3335
3336 for &(idx, target) in &ascending {
3342 let layer = &mut self.layers[idx];
3343
3344 if layer_is_open(layer) {
3345 continue;
3346 }
3347
3348 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3350 continue;
3351 }
3352
3353 let nr_cur = layer.cpus.weight();
3354 if nr_cur >= target {
3355 continue;
3356 }
3357
3358 let mut nr_to_alloc = target - nr_cur;
3359 let mut alloced = false;
3360
3361 while nr_to_alloc > 0 {
3362 let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3363 if nr_alloced == 0 {
3364 break;
3365 }
3366 alloced = true;
3367 nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3368 }
3369
3370 if alloced {
3371 Self::update_bpf_layer_cpumask(
3372 layer,
3373 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3374 );
3375 updated = true;
3376 }
3377 }
3378
3379 if updated {
3381 for (idx, layer) in self.layers.iter_mut().enumerate() {
3382 if !layer_is_open(layer) {
3383 continue;
3384 }
3385
3386 let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3387 let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3388 let nr_available_cpus = available_cpus.weight();
3389
3390 layer.cpus = available_cpus;
3393 layer.nr_cpus = nr_available_cpus;
3394 Self::update_bpf_layer_cpumask(layer, bpf_layer);
3395 }
3396
3397 self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3398 self.cpu_pool.fallback_cpu as u32;
3399
3400 for (lidx, layer) in self.layers.iter().enumerate() {
3401 self.nr_layer_cpus_ranges[lidx] = (
3402 self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3403 self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3404 );
3405 }
3406
3407 let input = ProgramInput {
3409 ..Default::default()
3410 };
3411 let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3412 let _ = prog.test_run(input);
3413
3414 let empty_layer_ids: Vec<u32> = self
3416 .layers
3417 .iter()
3418 .enumerate()
3419 .filter(|(_idx, layer)| layer.nr_cpus == 0)
3420 .map(|(idx, _layer)| idx as u32)
3421 .collect();
3422 for i in 0..self.layers.len() {
3423 self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3424 empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3425 }
3426 self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3427 empty_layer_ids.len() as u32;
3428 }
3429
3430 let _ = self.update_netdev_cpumasks();
3431 Ok(())
3432 }
3433
3434 fn refresh_idle_qos(&mut self) -> Result<()> {
3435 if !self.idle_qos_enabled {
3436 return Ok(());
3437 }
3438
3439 let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3440 for layer in self.layers.iter() {
3441 let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3442 for cpu in layer.cpus.iter() {
3443 cpu_idle_qos[cpu] = idle_resume_us;
3444 }
3445 }
3446
3447 for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3448 update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3449 }
3450
3451 Ok(())
3452 }
3453
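    // One scheduling interval: refresh stats, publish the system CPU
    // utilization and per-layer DSQ insert EWMAs (scaled by 10000) to BPF,
    // then refresh cpumasks, idle QoS and GPU task affinity.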
3454 fn step(&mut self) -> Result<()> {
3455 let started_at = Instant::now();
3456 self.sched_stats.refresh(
3457 &mut self.skel,
3458 &self.proc_reader,
3459 started_at,
3460 self.processing_dur,
3461 &self.gpu_task_handler,
3462 )?;
3463
3464 self.skel
3466 .maps
3467 .bss_data
3468 .as_mut()
3469 .unwrap()
3470 .system_cpu_util_ewma = (self.sched_stats.system_cpu_util_ewma * 10000.0) as u64;
3471
3472 for layer_id in 0..self.sched_stats.nr_layers {
3473 self.skel
3474 .maps
3475 .bss_data
3476 .as_mut()
3477 .unwrap()
3478 .layer_dsq_insert_ewma[layer_id] =
3479 (self.sched_stats.layer_dsq_insert_ewma[layer_id] * 10000.0) as u64;
3480 }
3481
3482 self.refresh_cpumasks()?;
3483 self.refresh_idle_qos()?;
3484 self.gpu_task_handler.maybe_affinitize();
3485 self.processing_dur += Instant::now().duration_since(started_at);
3486 Ok(())
3487 }
3488
3489 fn generate_sys_stats(
3490 &mut self,
3491 stats: &Stats,
3492 cpus_ranges: &mut [(usize, usize)],
3493 ) -> Result<SysStats> {
3494 let bstats = &stats.bpf_stats;
3495 let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3496
3497 for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3498 let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3499 sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3500 cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3501 }
3502
3503 Ok(sys_stats)
3504 }
3505
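    // Resolve the cgroup's id (its inode number), evaluate every cgroup regex
    // against the path to build a per-rule match bitmap, remember the
    // path -> id mapping and emit a Created event.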
3506 fn process_cgroup_creation(
3508 path: &Path,
3509 cgroup_regexes: &HashMap<u32, Regex>,
3510 cgroup_path_to_id: &mut HashMap<String, u64>,
3511 sender: &crossbeam::channel::Sender<CgroupEvent>,
3512 ) {
3513 let path_str = path.to_string_lossy().to_string();
3514
3515 let cgroup_id = std::fs::metadata(path)
3517 .map(|metadata| {
3518 use std::os::unix::fs::MetadataExt;
3519 metadata.ino()
3520 })
3521 .unwrap_or(0);
3522
3523 let mut match_bitmap = 0u64;
3525 for (rule_id, regex) in cgroup_regexes {
3526 if regex.is_match(&path_str) {
3527 match_bitmap |= 1u64 << rule_id;
3528 }
3529 }
3530
3531 cgroup_path_to_id.insert(path_str.clone(), cgroup_id);
3533
3534 if let Err(e) = sender.send(CgroupEvent::Created {
3536 path: path_str,
3537 cgroup_id,
3538 match_bitmap,
3539 }) {
3540 error!("Failed to send cgroup creation event: {}", e);
3541 }
3542 }
3543
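    // Watch /sys/fs/cgroup for cgroup creation and removal. A dedicated thread
    // first reports all existing cgroups, then polls inotify via select() with
    // a 100ms timeout so the shutdown flag is honored, forwarding events over
    // a bounded channel.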
3544 fn start_cgroup_watcher(
3545 shutdown: Arc<AtomicBool>,
3546 cgroup_regexes: HashMap<u32, Regex>,
3547 ) -> Result<Receiver<CgroupEvent>> {
3548 let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
3549 let mut wd_to_path = HashMap::new();
3550
3551 let (sender, receiver) = crossbeam::channel::bounded::<CgroupEvent>(1024);
3553
3554 let root_wd = inotify
3556 .watches()
3557 .add("/sys/fs/cgroup", WatchMask::CREATE | WatchMask::DELETE)
3558 .context("Failed to add watch for /sys/fs/cgroup")?;
3559 wd_to_path.insert(root_wd, PathBuf::from("/sys/fs/cgroup"));
3560
3561 Self::add_recursive_watches(&mut inotify, &mut wd_to_path, Path::new("/sys/fs/cgroup"))?;
3563
3564 std::thread::spawn(move || {
3566 let mut buffer = [0; 4096];
3567 let inotify_fd = inotify.as_raw_fd();
3568 let mut cgroup_path_to_id = HashMap::<String, u64>::new();
3570
3571 for entry in WalkDir::new("/sys/fs/cgroup")
3573 .into_iter()
3574 .filter_map(|e| e.ok())
3575 .filter(|e| e.file_type().is_dir())
3576 {
3577 let path = entry.path();
3578 Self::process_cgroup_creation(
3579 path,
3580 &cgroup_regexes,
3581 &mut cgroup_path_to_id,
3582 &sender,
3583 );
3584 }
3585
3586 while !shutdown.load(Ordering::Relaxed) {
3587 let ready = unsafe {
3589 let mut read_fds: libc::fd_set = std::mem::zeroed();
3590 libc::FD_ZERO(&mut read_fds);
3591 libc::FD_SET(inotify_fd, &mut read_fds);
3592
3593 let mut timeout = libc::timeval {
3594 tv_sec: 0,
3595                         tv_usec: 100_000, // 100ms, so the shutdown flag is re-checked promptly
3596                     };
3597
3598 libc::select(
3599 inotify_fd + 1,
3600 &mut read_fds,
3601 std::ptr::null_mut(),
3602 std::ptr::null_mut(),
3603 &mut timeout,
3604 )
3605 };
3606
3607 if ready <= 0 {
3608 continue;
3610 }
3611
3612 let events = match inotify.read_events(&mut buffer) {
3614 Ok(events) => events,
3615 Err(e) => {
3616 error!("Error reading inotify events: {}", e);
3617 break;
3618 }
3619 };
3620
3621 for event in events {
3622 if !event.mask.contains(inotify::EventMask::CREATE)
3623 && !event.mask.contains(inotify::EventMask::DELETE)
3624 {
3625 continue;
3626 }
3627
3628 let name = match event.name {
3629 Some(name) => name,
3630 None => continue,
3631 };
3632
3633 let parent_path = match wd_to_path.get(&event.wd) {
3634 Some(parent) => parent,
3635 None => {
3636 warn!("Unknown watch descriptor: {:?}", event.wd);
3637 continue;
3638 }
3639 };
3640
3641 let path = parent_path.join(name.to_string_lossy().as_ref());
3642
3643 if event.mask.contains(inotify::EventMask::CREATE) {
3644 if !path.is_dir() {
3645 continue;
3646 }
3647
3648 Self::process_cgroup_creation(
3649 &path,
3650 &cgroup_regexes,
3651 &mut cgroup_path_to_id,
3652 &sender,
3653 );
3654
3655 match inotify
3657 .watches()
3658 .add(&path, WatchMask::CREATE | WatchMask::DELETE)
3659 {
3660 Ok(wd) => {
3661 wd_to_path.insert(wd, path.clone());
3662 }
3663 Err(e) => {
3664 warn!(
3665 "Failed to add watch for new cgroup {}: {}",
3666 path.display(),
3667 e
3668 );
3669 }
3670 }
3671 } else if event.mask.contains(inotify::EventMask::DELETE) {
3672 let path_str = path.to_string_lossy().to_string();
3673
3674 let cgroup_id = cgroup_path_to_id.remove(&path_str).unwrap_or(0);
3676
3677 if let Err(e) = sender.send(CgroupEvent::Removed {
3679 path: path_str,
3680 cgroup_id,
3681 }) {
3682 error!("Failed to send cgroup removal event: {}", e);
3683 }
3684
3685 let wd_to_remove = wd_to_path.iter().find_map(|(wd, watched_path)| {
3687 if watched_path == &path {
3688 Some(wd.clone())
3689 } else {
3690 None
3691 }
3692 });
3693 if let Some(wd) = wd_to_remove {
3694 wd_to_path.remove(&wd);
3695 }
3696 }
3697 }
3698 }
3699 });
3700
3701 Ok(receiver)
3702 }
3703
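    // Add CREATE/DELETE inotify watches for every subdirectory under `path`
    // (the root itself is expected to be watched already).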
3704 fn add_recursive_watches(
3705 inotify: &mut Inotify,
3706 wd_to_path: &mut HashMap<inotify::WatchDescriptor, PathBuf>,
3707 path: &Path,
3708 ) -> Result<()> {
3709 for entry in WalkDir::new(path)
3710 .into_iter()
3711 .filter_map(|e| e.ok())
3712 .filter(|e| e.file_type().is_dir())
3713 .skip(1)
3714 {
3715 let entry_path = entry.path();
3716 match inotify
3718 .watches()
3719 .add(entry_path, WatchMask::CREATE | WatchMask::DELETE)
3720 {
3721 Ok(wd) => {
3722 wd_to_path.insert(wd, entry_path.to_path_buf());
3723 }
3724 Err(e) => {
3725 debug!("Failed to add watch for {}: {}", entry_path.display(), e);
3726 }
3727 }
3728 }
3729 Ok(())
3730 }
3731
3732 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3733 let (res_ch, req_ch) = self.stats_server.channels();
3734 let mut next_sched_at = Instant::now() + self.sched_intv;
3735 let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3736 let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3737 let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3738
3739 let cgroup_regexes = self.cgroup_regexes.take().unwrap();
3741 let cgroup_event_rx = if !cgroup_regexes.is_empty() {
3742 Some(Self::start_cgroup_watcher(
3743 shutdown.clone(),
3744 cgroup_regexes,
3745 )?)
3746 } else {
3747 None
3748 };
3749
3750 while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3751 let now = Instant::now();
3752
3753 if now >= next_sched_at {
3754 self.step()?;
3755 while next_sched_at < now {
3756 next_sched_at += self.sched_intv;
3757 }
3758 }
3759
3760 if enable_layer_refresh && now >= next_layer_refresh_at {
3761 self.skel
3762 .maps
3763 .bss_data
3764 .as_mut()
3765 .unwrap()
3766 .layer_refresh_seq_avgruntime += 1;
3767 while next_layer_refresh_at < now {
3768 next_layer_refresh_at += self.layer_refresh_intv;
3769 }
3770 }
3771
3772 let timeout_duration = next_sched_at.saturating_duration_since(Instant::now());
3774 let never_rx = crossbeam::channel::never();
3775 let cgroup_rx = cgroup_event_rx.as_ref().unwrap_or(&never_rx);
3776
3777 select! {
3778 recv(req_ch) -> msg => match msg {
3779 Ok(StatsReq::Hello(tid)) => {
3780 cpus_ranges.insert(
3781 tid,
3782 self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3783 );
3784 let stats =
3785 Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3786 res_ch.send(StatsRes::Hello(stats))?;
3787 }
3788 Ok(StatsReq::Refresh(tid, mut stats)) => {
3789 for i in 0..self.nr_layer_cpus_ranges.len() {
3791 for (_, ranges) in cpus_ranges.iter_mut() {
3792 ranges[i] = (
3793 ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3794 ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3795 );
3796 }
3797 self.nr_layer_cpus_ranges[i] =
3798 (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3799 }
3800
3801 stats.refresh(
3802 &mut self.skel,
3803 &self.proc_reader,
3804 now,
3805 self.processing_dur,
3806 &self.gpu_task_handler,
3807 )?;
3808 let sys_stats =
3809 self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3810 res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3811 }
3812 Ok(StatsReq::Bye(tid)) => {
3813 cpus_ranges.remove(&tid);
3814 res_ch.send(StatsRes::Bye)?;
3815 }
3816 Err(e) => Err(e)?,
3817 },
3818
3819 recv(cgroup_rx) -> event => match event {
3820 Ok(CgroupEvent::Created { path, cgroup_id, match_bitmap }) => {
3821 self.skel.maps.cgroup_match_bitmap.update(
3823 &cgroup_id.to_ne_bytes(),
3824 &match_bitmap.to_ne_bytes(),
3825 libbpf_rs::MapFlags::ANY,
3826 ).with_context(|| format!(
3827 "Failed to insert cgroup {}({}) into BPF map. Cgroup map may be full \
3828 (max 16384 entries). Aborting.",
3829 cgroup_id, path
3830 ))?;
3831
3832 debug!("Added cgroup {} to BPF map with bitmap 0x{:x}", cgroup_id, match_bitmap);
3833 }
3834 Ok(CgroupEvent::Removed { path, cgroup_id }) => {
3835 if let Err(e) = self.skel.maps.cgroup_match_bitmap.delete(&cgroup_id.to_ne_bytes()) {
3837 warn!("Failed to delete cgroup {} from BPF map: {}", cgroup_id, e);
3838 } else {
3839 debug!("Removed cgroup {}({}) from BPF map", cgroup_id, path);
3840 }
3841 }
3842 Err(e) => {
3843 error!("Error receiving cgroup event: {}", e);
3844 }
3845 },
3846
3847 recv(crossbeam::channel::after(timeout_duration)) -> _ => {
3848 }
3850 }
3851 }
3852
3853 let _ = self.struct_ops.take();
3854 uei_report!(&self.skel, uei)
3855 }
3856}
3857
3858impl Drop for Scheduler<'_> {
3859 fn drop(&mut self) {
3860 info!("Unregister {SCHEDULER_NAME} scheduler");
3861
3862 if let Some(struct_ops) = self.struct_ops.take() {
3863 drop(struct_ops);
3864 }
3865 }
3866}
3867
3868fn write_example_file(path: &str) -> Result<()> {
3869 let mut f = fs::OpenOptions::new()
3870 .create_new(true)
3871 .write(true)
3872 .open(path)?;
3873 Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3874}
3875
3876struct HintLayerInfo {
3877 layer_id: usize,
3878 system_cpu_util_below: Option<f64>,
3879 dsq_insert_below: Option<f64>,
3880}
3881
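// Validate the layer specs (match structure, thresholds, cpus/util ranges)
// and build the mapping from task hint values to layer ids along with any
// high-frequency matcher thresholds.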
3882fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, HintLayerInfo>> {
3883 let mut hint_to_layer_map = HashMap::<u64, (usize, String, Option<f64>, Option<f64>)>::new();
3884
3885 let nr_specs = specs.len();
3886 if nr_specs == 0 {
3887 bail!("No layer spec");
3888 }
3889 if nr_specs > MAX_LAYERS {
3890 bail!("Too many layer specs");
3891 }
3892
3893 for (idx, spec) in specs.iter().enumerate() {
3894 if idx < nr_specs - 1 {
3895 if spec.matches.is_empty() {
3896 bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3897 }
3898 } else {
3899 if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3900 bail!("Terminal spec {:?} must have an empty match", spec.name);
3901 }
3902 }
3903
3904 if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3905 bail!(
3906 "Spec {:?} has too many ({}) OR match blocks",
3907 spec.name,
3908 spec.matches.len()
3909 );
3910 }
3911
3912 for (ands_idx, ands) in spec.matches.iter().enumerate() {
3913 if ands.len() > NR_LAYER_MATCH_KINDS {
3914 bail!(
3915 "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3916 spec.name,
3917 ands_idx,
3918 ands.len()
3919 );
3920 }
3921 let mut hint_equals_cnt = 0;
3922 let mut system_cpu_util_below_cnt = 0;
3923 let mut dsq_insert_below_cnt = 0;
3924 let mut hint_value: Option<u64> = None;
3925 let mut system_cpu_util_threshold: Option<f64> = None;
3926 let mut dsq_insert_threshold: Option<f64> = None;
3927 for one in ands.iter() {
3928 match one {
3929 LayerMatch::CgroupPrefix(prefix) => {
3930 if prefix.len() > MAX_PATH {
3931 bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3932 }
3933 }
3934 LayerMatch::CgroupSuffix(suffix) => {
3935 if suffix.len() > MAX_PATH {
3936 bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3937 }
3938 }
3939 LayerMatch::CgroupContains(substr) => {
3940 if substr.len() > MAX_PATH {
3941 bail!("Spec {:?} has too long a cgroup substr", spec.name);
3942 }
3943 }
3944 LayerMatch::CommPrefix(prefix) => {
3945 if prefix.len() > MAX_COMM {
3946 bail!("Spec {:?} has too long a comm prefix", spec.name);
3947 }
3948 }
3949 LayerMatch::PcommPrefix(prefix) => {
3950 if prefix.len() > MAX_COMM {
3951 bail!("Spec {:?} has too long a process name prefix", spec.name);
3952 }
3953 }
3954 LayerMatch::SystemCpuUtilBelow(threshold) => {
3955 if *threshold < 0.0 || *threshold > 1.0 {
3956 bail!(
3957 "Spec {:?} has SystemCpuUtilBelow threshold outside the range [0.0, 1.0]",
3958 spec.name
3959 );
3960 }
3961 system_cpu_util_threshold = Some(*threshold);
3962 system_cpu_util_below_cnt += 1;
3963 }
3964 LayerMatch::DsqInsertBelow(threshold) => {
3965 if *threshold < 0.0 || *threshold > 1.0 {
3966 bail!(
3967 "Spec {:?} has DsqInsertBelow threshold outside the range [0.0, 1.0]",
3968 spec.name
3969 );
3970 }
3971 dsq_insert_threshold = Some(*threshold);
3972 dsq_insert_below_cnt += 1;
3973 }
3974 LayerMatch::HintEquals(hint) => {
3975 if *hint > 1024 {
3976 bail!(
3977 "Spec {:?} has hint value outside the range [0, 1024]",
3978 spec.name
3979 );
3980 }
3981 hint_value = Some(*hint);
3982 hint_equals_cnt += 1;
3983 }
3984 _ => {}
3985 }
3986 }
3987 if hint_equals_cnt > 1 {
3988 bail!("Only 1 HintEquals match permitted per AND block");
3989 }
3990 let high_freq_matcher_cnt = system_cpu_util_below_cnt + dsq_insert_below_cnt;
3991 if high_freq_matcher_cnt > 0 {
3992 if hint_equals_cnt != 1 {
3993 bail!("High-frequency matchers (SystemCpuUtilBelow, DsqInsertBelow) must be used with one HintEquals");
3994 }
3995 if system_cpu_util_below_cnt > 1 {
3996 bail!("Only 1 SystemCpuUtilBelow match permitted per AND block");
3997 }
3998 if dsq_insert_below_cnt > 1 {
3999 bail!("Only 1 DsqInsertBelow match permitted per AND block");
4000 }
4001 if ands.len() != hint_equals_cnt + system_cpu_util_below_cnt + dsq_insert_below_cnt
4002 {
4003 bail!("High-frequency matchers must be used only with HintEquals (no other matchers)");
4004 }
4005 } else if hint_equals_cnt == 1 && ands.len() != 1 {
4006 bail!("HintEquals match cannot be in conjunction with other matches");
4007 }
4008
4009 if let Some(hint) = hint_value {
4011 if let Some((layer_id, name, _, _)) = hint_to_layer_map.get(&hint) {
4012 if *layer_id != idx {
4013 bail!(
4014 "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
4015 spec.name,
4016 hint,
4017 name
4018 );
4019 }
4020 } else {
4021 hint_to_layer_map.insert(
4022 hint,
4023 (
4024 idx,
4025 spec.name.clone(),
4026 system_cpu_util_threshold,
4027 dsq_insert_threshold,
4028 ),
4029 );
4030 }
4031 }
4032 }
4033
4034 match spec.kind {
4035 LayerKind::Confined {
4036 cpus_range,
4037 util_range,
4038 ..
4039 }
4040 | LayerKind::Grouped {
4041 cpus_range,
4042 util_range,
4043 ..
4044 } => {
4045 if let Some((cpus_min, cpus_max)) = cpus_range {
4046 if cpus_min > cpus_max {
4047 bail!(
4048 "Spec {:?} has invalid cpus_range({}, {})",
4049 spec.name,
4050 cpus_min,
4051 cpus_max
4052 );
4053 }
4054 }
4055 if util_range.0 >= util_range.1 {
4056 bail!(
4057 "Spec {:?} has invalid util_range ({}, {})",
4058 spec.name,
4059 util_range.0,
4060 util_range.1
4061 );
4062 }
4063 }
4064 _ => {}
4065 }
4066 }
4067
4068 Ok(hint_to_layer_map
4069 .into_iter()
4070 .map(|(k, v)| {
4071 (
4072 k,
4073 HintLayerInfo {
4074 layer_id: v.0,
4075 system_cpu_util_below: v.2,
4076 dsq_insert_below: v.3,
4077 },
4078 )
4079 })
4080 .collect())
4081}
4082
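// Return (up to) the last `len` characters of `cgroup`.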
4083fn name_suffix(cgroup: &str, len: usize) -> String {
4084 let suffixlen = std::cmp::min(len, cgroup.len());
4085 let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
4086
4087 suffixrev.chars().rev().collect()
4088}
4089
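// Recursively collect every directory below `dir`.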
4090fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
4091 let mut paths = vec![];
4092
4093 if !dir.is_dir() {
4094         bail!("path {:?} does not correspond to a directory", dir);
4095 }
4096
4097 let direntries = fs::read_dir(dir)?;
4098
4099 for entry in direntries {
4100 let path = entry?.path();
4101 if path.is_dir() {
4102 paths.append(&mut traverse_sysfs(&path)?);
4103 paths.push(path);
4104 }
4105 }
4106
4107 Ok(paths)
4108}
4109
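// Read a cgroup's cpuset.cpus.effective and parse it into a Cpumask.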
4110fn find_cpumask(cgroup: &str) -> Cpumask {
4111     let path = format!("{}/cpuset.cpus.effective", cgroup);
4113
4114     let description = fs::read_to_string(&path).unwrap();
4115
4116 Cpumask::from_cpulist(&description).unwrap()
4117}
4118
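// Expand a template match (CgroupSuffix or CgroupRegex) into one concrete
// (matcher, cpumask) pair per matching cgroup under /sys/fs/cgroup. The
// generated matcher is a slash-terminated suffix capped at 64 characters and
// the cpumask is the cgroup's effective cpuset.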
4119fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
4120 match rule {
4121 LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
4122 .into_iter()
4123 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4124 .filter(|cgroup| cgroup.ends_with(suffix))
4125 .map(|cgroup| {
4126 (
4127 {
4128 let mut slashterminated = cgroup.clone();
4129 slashterminated.push('/');
4130 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4131 },
4132 find_cpumask(&cgroup),
4133 )
4134 })
4135 .collect()),
4136         LayerMatch::CgroupRegex(expr) => {
            // Compile the regex once (rather than per cgroup) and surface a
            // bad expression as an error instead of panicking mid-iteration.
            let re = Regex::new(expr).with_context(|| format!("invalid cgroup regex {:?}", expr))?;
            Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
4137             .into_iter()
4138             .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4139             .filter(|cgroup| re.is_match(cgroup))
4143             .map(|cgroup| {
4144                 (
4145                     {
4149                         let mut slashterminated = cgroup.clone();
4150                         slashterminated.push('/');
4151                         LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4152                     },
4153                     find_cpumask(&cgroup),
4154                 )
4155             })
4156             .collect())
        }
4157         _ => bail!("Unimplemented template match {:?}", rule),
4158 }
4159}
4160
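// Open the raw memory bandwidth PMU event on every possible CPU (system-wide,
// pid = -1) and record the resulting fds in the scx_pmu_map BPF map keyed by
// CPU, for use by the BPF side.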
4161fn create_perf_fds(skel: &mut BpfSkel, event: u64) -> Result<()> {
4162 let mut attr = perf::bindings::perf_event_attr::default();
4163 attr.size = std::mem::size_of::<perf::bindings::perf_event_attr>() as u32;
4164 attr.type_ = perf::bindings::PERF_TYPE_RAW;
4165 attr.config = event;
4166 attr.sample_type = 0u64;
4167 attr.__bindgen_anon_1.sample_period = 0u64;
4168 attr.set_disabled(0);
4169
4170 let perf_events_map = &skel.maps.scx_pmu_map;
4171 let map_fd = unsafe { libbpf_sys::bpf_map__fd(perf_events_map.as_libbpf_object().as_ptr()) };
4172
4173 let mut failures = 0u64;
4174
4175 for cpu in 0..*NR_CPUS_POSSIBLE {
4176 let fd = unsafe { perf::perf_event_open(&mut attr as *mut _, -1, cpu as i32, -1, 0) };
4177 if fd < 0 {
4178 failures += 1;
4179 trace!(
4180 "perf_event_open failed cpu={cpu} errno={}",
4181 std::io::Error::last_os_error()
4182 );
4183 continue;
4184 }
4185
4186 let key = cpu as u32;
4187 let val = fd as u32;
4188 let ret = unsafe {
4189 libbpf_sys::bpf_map_update_elem(
4190 map_fd,
4191 &key as *const _ as *const _,
4192 &val as *const _ as *const _,
4193 0,
4194 )
4195 };
4196 if ret != 0 {
4197 trace!("bpf_map_update_elem failed cpu={cpu} fd={fd} ret={ret}");
4198 } else {
4199 trace!("mapped cpu={cpu} -> fd={fd}");
4200 }
4201 }
4202
4203 if failures > 0 {
4204         warn!("membw tracking: failed to install {failures} counters");
4205 }
4207
4208 Ok(())
4209}
4210
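// Pick a vendor-specific raw PMU event that approximates memory bandwidth
// (DRAM/IO fill events on AMD Zen, LLC misses on Intel), encode its
// umask/event into a raw config value, publish it via rodata.membw_event and
// return it.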
4211fn setup_membw_tracking(skel: &mut OpenBpfSkel) -> Result<u64> {
4213 let pmumanager = PMUManager::new()?;
4214 let codename = &pmumanager.codename as &str;
4215
4216 let pmuspec = match codename {
4217 "amdzen1" | "amdzen2" | "amdzen3" => {
4218 trace!("found AMD codename {codename}");
4219 pmumanager.pmus.get("ls_any_fills_from_sys.mem_io_local")
4220 }
4221 "amdzen4" | "amdzen5" => {
4222 trace!("found AMD codename {codename}");
4223 pmumanager.pmus.get("ls_any_fills_from_sys.dram_io_all")
4224 }
4225
4226 "haswell" | "broadwell" | "broadwellde" | "broadwellx" | "skylake" | "skylakex"
4227 | "cascadelakex" | "arrowlake" | "meteorlake" | "sapphirerapids" | "emeraldrapids"
4228 | "graniterapids" => {
4229 trace!("found Intel codename {codename}");
4230 pmumanager.pmus.get("LONGEST_LAT_CACHE.MISS")
4231 }
4232
4233 _ => {
4234 trace!("found unknown codename {codename}");
4235 None
4236 }
4237 };
4238
4239     let spec = pmuspec.ok_or_else(|| anyhow!("no memory bandwidth event known for codename {codename}"))?;
4240 let config = (spec.umask << 8) | spec.event[0];
4241
4242 skel.maps.rodata_data.as_mut().unwrap().membw_event = config;
4244
4245 Ok(config)
4246}
4247
4248fn main() -> Result<()> {
4249 let opts = Opts::parse();
4250
4251 if opts.version {
4252 println!(
4253 "scx_layered {}",
4254 build_id::full_version(env!("CARGO_PKG_VERSION"))
4255 );
4256 return Ok(());
4257 }
4258
4259 if opts.help_stats {
4260 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
4261 return Ok(());
4262 }
4263
4264 if opts.no_load_frac_limit {
4265 warn!("--no-load-frac-limit is deprecated and noop");
4266 }
4267 if opts.layer_preempt_weight_disable != 0.0 {
4268 warn!("--layer-preempt-weight-disable is deprecated and noop");
4269 }
4270 if opts.layer_growth_weight_disable != 0.0 {
4271 warn!("--layer-growth-weight-disable is deprecated and noop");
4272 }
4273 if opts.local_llc_iteration {
4274 warn!("--local_llc_iteration is deprecated and noop");
4275 }
4276
4277 let llv = match opts.verbose {
4278 0 => simplelog::LevelFilter::Info,
4279 1 => simplelog::LevelFilter::Debug,
4280 _ => simplelog::LevelFilter::Trace,
4281 };
4282 let mut lcfg = simplelog::ConfigBuilder::new();
4283 lcfg.set_time_offset_to_local()
4284 .expect("Failed to set local time offset")
4285 .set_time_level(simplelog::LevelFilter::Error)
4286 .set_location_level(simplelog::LevelFilter::Off)
4287 .set_target_level(simplelog::LevelFilter::Off)
4288 .set_thread_level(simplelog::LevelFilter::Off);
4289 simplelog::TermLogger::init(
4290 llv,
4291 lcfg.build(),
4292 simplelog::TerminalMode::Stderr,
4293 simplelog::ColorChoice::Auto,
4294 )?;
4295
4296 debug!("opts={:?}", &opts);
4297
4298 let shutdown = Arc::new(AtomicBool::new(false));
4299 let shutdown_clone = shutdown.clone();
4300 ctrlc::set_handler(move || {
4301 shutdown_clone.store(true, Ordering::Relaxed);
4302 })
4303 .context("Error setting Ctrl-C handler")?;
4304
4305 if let Some(intv) = opts.monitor.or(opts.stats) {
4306 let shutdown_copy = shutdown.clone();
4307 let jh = std::thread::spawn(move || {
4308 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
4309 Ok(_) => {
4310 debug!("stats monitor thread finished successfully")
4311 }
4312 Err(error_object) => {
4313 warn!(
4314 "stats monitor thread finished because of an error {}",
4315 error_object
4316 )
4317 }
4318 }
4319 });
4320 if opts.monitor.is_some() {
4321 let _ = jh.join();
4322 return Ok(());
4323 }
4324 }
4325
4326 if let Some(path) = &opts.example {
4327 write_example_file(path)?;
4328 return Ok(());
4329 }
4330
4331 let mut layer_config = match opts.run_example {
4332 true => EXAMPLE_CONFIG.clone(),
4333 false => LayerConfig { specs: vec![] },
4334 };
4335
4336 for (idx, input) in opts.specs.iter().enumerate() {
4337 let specs = LayerSpec::parse(input)
4338 .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
4339
4340 for spec in specs {
4341 match spec.template {
4342 Some(ref rule) => {
4343 let matches = expand_template(&rule)?;
4344 if matches.is_empty() {
4347 layer_config.specs.push(spec);
4348 } else {
4349 for (mt, mask) in matches {
4350 let mut genspec = spec.clone();
4351
4352 genspec.cpuset = Some(mask);
4353
4354 for orterm in &mut genspec.matches {
4356 orterm.push(mt.clone());
4357 }
4358
4359 match &mt {
4360 LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
4361 _ => bail!("Template match has unexpected type"),
4362 }
4363
4364 layer_config.specs.push(genspec);
4366 }
4367 }
4368 }
4369
4370 None => {
4371 layer_config.specs.push(spec);
4372 }
4373 }
4374 }
4375 }
4376
4377 for spec in layer_config.specs.iter_mut() {
4378 let common = spec.kind.common_mut();
4379
4380 if common.slice_us == 0 {
4381 common.slice_us = opts.slice_us;
4382 }
4383
4384 if common.weight == 0 {
4385 common.weight = DEFAULT_LAYER_WEIGHT;
4386 }
4387 common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
4388
4389 if common.preempt {
4390 if common.disallow_open_after_us.is_some() {
4391 warn!(
4392 "Preempt layer {} has non-null disallow_open_after_us, ignored",
4393 &spec.name
4394 );
4395 }
4396 if common.disallow_preempt_after_us.is_some() {
4397 warn!(
4398 "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
4399 &spec.name
4400 );
4401 }
4402 common.disallow_open_after_us = Some(u64::MAX);
4403 common.disallow_preempt_after_us = Some(u64::MAX);
4404 } else {
4405 if common.disallow_open_after_us.is_none() {
4406 common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
4407 }
4408
4409 if common.disallow_preempt_after_us.is_none() {
4410 common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
4411 }
4412 }
4413
4414 if common.idle_smt.is_some() {
4415 warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
4416 }
4417 }
4418
4419 let membw_required = layer_config.specs.iter().any(|spec| match spec.kind {
4420 LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
4421 membw_gb.is_some()
4422 }
4423 LayerKind::Open { .. } => false,
4424 });
4425
4426 if opts.print_and_exit {
4427 println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
4428 return Ok(());
4429 }
4430
4431 debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
4432 let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
4433
4434 let mut open_object = MaybeUninit::uninit();
4435 loop {
4436 let mut sched = Scheduler::init(
4437 &opts,
4438 &layer_config.specs,
4439 &mut open_object,
4440 &hint_to_layer_map,
4441 membw_required,
4442 )?;
4443 if !sched.run(shutdown.clone())?.should_restart() {
4444 break;
4445 }
4446 }
4447
4448 Ok(())
4449}