1mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::ffi::CString;
12use std::fs;
13use std::io::Write;
14use std::mem::MaybeUninit;
15use std::ops::Sub;
16use std::path::Path;
17use std::path::PathBuf;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::thread::ThreadId;
22use std::time::Duration;
23use std::time::Instant;
24
25use inotify::{Inotify, WatchMask};
26use libc;
27use std::os::unix::io::AsRawFd;
28
29use anyhow::anyhow;
30use anyhow::bail;
31use anyhow::Context;
32use anyhow::Result;
33pub use bpf_skel::*;
34use clap::Parser;
35use crossbeam::channel::Receiver;
36use crossbeam::select;
37use lazy_static::lazy_static;
38use libbpf_rs::libbpf_sys;
39use libbpf_rs::AsRawLibbpf;
40use libbpf_rs::MapCore as _;
41use libbpf_rs::OpenObject;
42use libbpf_rs::ProgramInput;
43use nix::sched::CpuSet;
44use nvml_wrapper::error::NvmlError;
45use nvml_wrapper::Nvml;
46use once_cell::sync::OnceCell;
47use regex::Regex;
48use scx_bpf_compat;
49use scx_layered::*;
50use scx_raw_pmu::PMUManager;
51use scx_stats::prelude::*;
52use scx_utils::build_id;
53use scx_utils::compat;
54use scx_utils::init_libbpf_logging;
55use scx_utils::libbpf_clap_opts::LibbpfOpts;
56use scx_utils::perf;
57use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
58use scx_utils::read_netdevs;
59use scx_utils::scx_enums;
60use scx_utils::scx_ops_attach;
61use scx_utils::scx_ops_load;
62use scx_utils::scx_ops_open;
63use scx_utils::uei_exited;
64use scx_utils::uei_report;
65use scx_utils::CoreType;
66use scx_utils::Cpumask;
67use scx_utils::Llc;
68use scx_utils::NetDev;
69use scx_utils::Topology;
70use scx_utils::TopologyArgs;
71use scx_utils::UserExitInfo;
72use scx_utils::NR_CPUS_POSSIBLE;
73use scx_utils::NR_CPU_IDS;
74use stats::LayerStats;
75use stats::StatsReq;
76use stats::StatsRes;
77use stats::SysStats;
78use std::collections::VecDeque;
79use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
80use tracing::{debug, error, info, trace, warn};
81use tracing_subscriber::filter::EnvFilter;
82use walkdir::WalkDir;
83
84const SCHEDULER_NAME: &str = "scx_layered";
85const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
86const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
87const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
88const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
89const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
90const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
91const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
92const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
93const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
94const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
95
96const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
97const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
98const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
99const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
100const LAYER_USAGE_PROTECTED_PREEMPT: usize =
101 bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
102const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
103
104const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
105const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
106const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
107
108const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
109
110static NVML: OnceCell<Nvml> = OnceCell::new();
111
112fn nvml() -> Result<&'static Nvml, NvmlError> {
113 NVML.get_or_try_init(Nvml::init)
114}
115
116lazy_static! {
117 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
118 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
119 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
120 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
121 specs: vec![
122 LayerSpec {
123 name: "batch".into(),
124 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
125 cpuset: None,
126 template: None,
127 matches: vec![
128 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
129 vec![LayerMatch::NiceAbove(0)],
130 ],
131 kind: LayerKind::Confined {
132 util_range: (0.8, 0.9),
133 cpus_range: Some((0, 16)),
134 cpus_range_frac: None,
135 protected: false,
136 membw_gb: None,
137 common: LayerCommon {
138 min_exec_us: 1000,
139 yield_ignore: 0.0,
140 preempt: false,
141 preempt_first: false,
142 exclusive: false,
143 allow_node_aligned: false,
144 skip_remote_node: false,
145 prev_over_idle_core: false,
146 idle_smt: None,
147 slice_us: 20000,
148 fifo: false,
149 weight: DEFAULT_LAYER_WEIGHT,
150 disallow_open_after_us: None,
151 disallow_preempt_after_us: None,
152 xllc_mig_min_us: 1000.0,
153 growth_algo: LayerGrowthAlgo::Sticky,
154 idle_resume_us: None,
155 perf: 1024,
156 nodes: vec![],
157 llcs: vec![],
158 member_expire_ms: 0,
159 placement: LayerPlacement::Standard,
160 },
161 },
162 },
163 LayerSpec {
164 name: "immediate".into(),
165 comment: Some("tasks under workload.slice with nice value < 0".into()),
166 cpuset: None,
167 template: None,
168 matches: vec![vec![
169 LayerMatch::CgroupPrefix("workload.slice/".into()),
170 LayerMatch::NiceBelow(0),
171 ]],
172 kind: LayerKind::Open {
173 common: LayerCommon {
174 min_exec_us: 100,
175 yield_ignore: 0.25,
176 preempt: true,
177 preempt_first: false,
178 exclusive: true,
179 allow_node_aligned: true,
180 skip_remote_node: false,
181 prev_over_idle_core: true,
182 idle_smt: None,
183 slice_us: 20000,
184 fifo: false,
185 weight: DEFAULT_LAYER_WEIGHT,
186 disallow_open_after_us: None,
187 disallow_preempt_after_us: None,
188 xllc_mig_min_us: 0.0,
189 growth_algo: LayerGrowthAlgo::Sticky,
190 perf: 1024,
191 idle_resume_us: None,
192 nodes: vec![],
193 llcs: vec![],
194 member_expire_ms: 0,
195 placement: LayerPlacement::Standard,
196 },
197 },
198 },
199 LayerSpec {
200 name: "stress-ng".into(),
201 comment: Some("stress-ng test layer".into()),
202 cpuset: None,
203 template: None,
204 matches: vec![
205 vec![LayerMatch::CommPrefix("stress-ng".into()),],
206 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
207 ],
208 kind: LayerKind::Confined {
209 cpus_range: None,
210 util_range: (0.2, 0.8),
211 protected: false,
212 cpus_range_frac: None,
213 membw_gb: None,
214 common: LayerCommon {
215 min_exec_us: 800,
216 yield_ignore: 0.0,
217 preempt: true,
218 preempt_first: false,
219 exclusive: false,
220 allow_node_aligned: false,
221 skip_remote_node: false,
222 prev_over_idle_core: false,
223 idle_smt: None,
224 slice_us: 800,
225 fifo: false,
226 weight: DEFAULT_LAYER_WEIGHT,
227 disallow_open_after_us: None,
228 disallow_preempt_after_us: None,
229 xllc_mig_min_us: 0.0,
230 growth_algo: LayerGrowthAlgo::Topo,
231 perf: 1024,
232 idle_resume_us: None,
233 nodes: vec![],
234 llcs: vec![],
235 member_expire_ms: 0,
236 placement: LayerPlacement::Standard,
237 },
238 },
239 },
240 LayerSpec {
241 name: "normal".into(),
242 comment: Some("the rest".into()),
243 cpuset: None,
244 template: None,
245 matches: vec![vec![]],
246 kind: LayerKind::Grouped {
247 cpus_range: None,
248 util_range: (0.5, 0.6),
249 util_includes_open_cputime: true,
250 protected: false,
251 cpus_range_frac: None,
252 membw_gb: None,
253 common: LayerCommon {
254 min_exec_us: 200,
255 yield_ignore: 0.0,
256 preempt: false,
257 preempt_first: false,
258 exclusive: false,
259 allow_node_aligned: false,
260 skip_remote_node: false,
261 prev_over_idle_core: false,
262 idle_smt: None,
263 slice_us: 20000,
264 fifo: false,
265 weight: DEFAULT_LAYER_WEIGHT,
266 disallow_open_after_us: None,
267 disallow_preempt_after_us: None,
268 xllc_mig_min_us: 100.0,
269 growth_algo: LayerGrowthAlgo::Linear,
270 perf: 1024,
271 idle_resume_us: None,
272 nodes: vec![],
273 llcs: vec![],
274 member_expire_ms: 0,
275 placement: LayerPlacement::Standard,
276 },
277 },
278 },
279 ],
280 };
281}
282
283#[derive(Debug, Parser)]
571#[command(verbatim_doc_comment)]
572struct Opts {
573 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
575 verbose: u8,
576
577 #[clap(short = 's', long, default_value = "20000")]
579 slice_us: u64,
580
581 #[clap(short = 'M', long, default_value = "0")]
586 max_exec_us: u64,
587
588 #[clap(short = 'i', long, default_value = "0.1")]
590 interval: f64,
591
592 #[clap(short = 'n', long, default_value = "false")]
595 no_load_frac_limit: bool,
596
597 #[clap(long, default_value = "0")]
599 exit_dump_len: u32,
600
601 #[clap(long, default_value = "info")]
604 log_level: String,
605
606 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
610 disable_topology: Option<bool>,
611
612 #[clap(long)]
614 xnuma_preemption: bool,
615
616 #[clap(long)]
618 monitor_disable: bool,
619
620 #[clap(short = 'e', long)]
622 example: Option<String>,
623
624 #[clap(long, default_value = "0.0")]
628 layer_preempt_weight_disable: f64,
629
630 #[clap(long, default_value = "0.0")]
634 layer_growth_weight_disable: f64,
635
636 #[clap(long)]
638 stats: Option<f64>,
639
640 #[clap(long)]
643 monitor: Option<f64>,
644
645 #[clap(long)]
647 run_example: bool,
648
649 #[clap(long, default_value = "false")]
652 local_llc_iteration: bool,
653
654 #[clap(long, default_value = "10000")]
659 lo_fb_wait_us: u64,
660
661 #[clap(long, default_value = ".05")]
664 lo_fb_share: f64,
665
666 #[clap(long, default_value = "false")]
668 disable_antistall: bool,
669
670 #[clap(long, default_value = "false")]
672 enable_gpu_affinitize: bool,
673
674 #[clap(long, default_value = "900")]
677 gpu_affinitize_secs: u64,
678
679 #[clap(long, default_value = "false")]
684 enable_match_debug: bool,
685
686 #[clap(long, default_value = "3")]
688 antistall_sec: u64,
689
690 #[clap(long, default_value = "false")]
692 enable_gpu_support: bool,
693
694 #[clap(long, default_value = "3")]
702 gpu_kprobe_level: u64,
703
704 #[clap(long, default_value = "false")]
706 netdev_irq_balance: bool,
707
708 #[clap(long, default_value = "false")]
710 disable_queued_wakeup: bool,
711
712 #[clap(long, default_value = "false")]
714 disable_percpu_kthread_preempt: bool,
715
716 #[clap(long, default_value = "false")]
720 percpu_kthread_preempt_all: bool,
721
722 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
724 version: bool,
725
726 #[clap(long)]
728 run_id: Option<u64>,
729
730 #[clap(long)]
732 help_stats: bool,
733
734 specs: Vec<String>,
736
737 #[clap(long, default_value = "2000")]
740 layer_refresh_ms_avgruntime: u64,
741
742 #[clap(long, default_value = "")]
744 task_hint_map: String,
745
746 #[clap(long, default_value = "false")]
748 print_and_exit: bool,
749
750 #[clap(long, default_value = "")]
752 hi_fb_thread_name: String,
753
754 #[clap(flatten, next_help_heading = "Topology Options")]
755 topology: TopologyArgs,
756
757 #[clap(flatten, next_help_heading = "Libbpf Options")]
758 pub libbpf: LibbpfOpts,
759}
760
761#[derive(Debug, Clone)]
763enum CgroupEvent {
764 Created {
765 path: String,
766 cgroup_id: u64, match_bitmap: u64, },
769 Removed {
770 path: String,
771 cgroup_id: u64, },
773}
774
775fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
776 reader
777 .read_stat()
778 .context("Failed to read procfs")?
779 .total_cpu
780 .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
781}
782
783fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
784 match (curr, prev) {
785 (
786 fb_procfs::CpuStat {
787 user_usec: Some(curr_user),
788 nice_usec: Some(curr_nice),
789 system_usec: Some(curr_system),
790 idle_usec: Some(curr_idle),
791 iowait_usec: Some(curr_iowait),
792 irq_usec: Some(curr_irq),
793 softirq_usec: Some(curr_softirq),
794 stolen_usec: Some(curr_stolen),
795 ..
796 },
797 fb_procfs::CpuStat {
798 user_usec: Some(prev_user),
799 nice_usec: Some(prev_nice),
800 system_usec: Some(prev_system),
801 idle_usec: Some(prev_idle),
802 iowait_usec: Some(prev_iowait),
803 irq_usec: Some(prev_irq),
804 softirq_usec: Some(prev_softirq),
805 stolen_usec: Some(prev_stolen),
806 ..
807 },
808 ) => {
809 let idle_usec = curr_idle.saturating_sub(*prev_idle);
810 let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
811 let user_usec = curr_user.saturating_sub(*prev_user);
812 let system_usec = curr_system.saturating_sub(*prev_system);
813 let nice_usec = curr_nice.saturating_sub(*prev_nice);
814 let irq_usec = curr_irq.saturating_sub(*prev_irq);
815 let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
816 let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
817
818 let busy_usec =
819 user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
820 let total_usec = idle_usec + busy_usec + iowait_usec;
821 if total_usec > 0 {
822 Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
823 } else {
824 Ok(1.0)
825 }
826 }
827 _ => bail!("Missing stats in cpustat"),
828 }
829}
830
831fn copy_into_cstr(dst: &mut [i8], src: &str) {
832 let cstr = CString::new(src).unwrap();
833 let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
834 dst[0..bytes.len()].copy_from_slice(bytes);
835}
836
837fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
838 let mut mask = 0;
839 for node in nodes {
840 mask |= 1 << node;
841 }
842 mask
843}
844
845fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
846 let mut mask = 0;
847 for (_, cache) in llcs {
848 mask |= 1 << cache.id;
849 }
850 mask
851}
852
853fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
854 let mut cpu_ctxs = vec![];
855 let cpu_ctxs_vec = skel
856 .maps
857 .cpu_ctxs
858 .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
859 .context("Failed to lookup cpu_ctx")?
860 .unwrap();
861 for cpu in 0..*NR_CPUS_POSSIBLE {
862 cpu_ctxs.push(*unsafe {
863 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
864 });
865 }
866 Ok(cpu_ctxs)
867}
868
869#[derive(Clone, Debug)]
870struct BpfStats {
871 gstats: Vec<u64>,
872 lstats: Vec<Vec<u64>>,
873 lstats_sums: Vec<u64>,
874 llc_lstats: Vec<Vec<Vec<u64>>>, }
876
877impl BpfStats {
878 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
879 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
880 let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
881 let mut gstats = vec![0u64; NR_GSTATS];
882 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
883 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
884
885 for cpu in 0..*NR_CPUS_POSSIBLE {
886 for stat in 0..NR_GSTATS {
887 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
888 }
889 for layer in 0..nr_layers {
890 for stat in 0..NR_LSTATS {
891 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
892 }
893 }
894 }
895
896 let mut lstats_sums = vec![0u64; NR_LSTATS];
897 for layer in 0..nr_layers {
898 for stat in 0..NR_LSTATS {
899 lstats_sums[stat] += lstats[layer][stat];
900 }
901 }
902
903 for llc_id in 0..nr_llcs {
904 let key = llc_id as u32;
910 let llc_id_slice =
911 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
912 let v = skel
913 .maps
914 .llc_data
915 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
916 .unwrap()
917 .unwrap();
918 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
919
920 for layer_id in 0..nr_layers {
921 for stat_id in 0..NR_LLC_LSTATS {
922 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
923 }
924 }
925 }
926
927 Self {
928 gstats,
929 lstats,
930 lstats_sums,
931 llc_lstats,
932 }
933 }
934}
935
936impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
937 type Output = BpfStats;
938
939 fn sub(self, rhs: &'b BpfStats) -> BpfStats {
940 let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
941 BpfStats {
942 gstats: vec_sub(&self.gstats, &rhs.gstats),
943 lstats: self
944 .lstats
945 .iter()
946 .zip(rhs.lstats.iter())
947 .map(|(l, r)| vec_sub(l, r))
948 .collect(),
949 lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
950 llc_lstats: self
951 .llc_lstats
952 .iter()
953 .zip(rhs.llc_lstats.iter())
954 .map(|(l_layer, r_layer)| {
955 l_layer
956 .iter()
957 .zip(r_layer.iter())
958 .map(|(l_llc, r_llc)| {
959 let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
960 r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
962 vec_sub(&l_llc, &r_llc)
963 })
964 .collect()
965 })
966 .collect(),
967 }
968 }
969}
970
971#[derive(Clone, Debug)]
972struct Stats {
973 at: Instant,
974 elapsed: Duration,
975 nr_layers: usize,
976 nr_layer_tasks: Vec<usize>,
977 nr_nodes: usize,
978
979 total_util: f64, layer_utils: Vec<Vec<f64>>,
981 prev_layer_usages: Vec<Vec<u64>>,
982
983 layer_membws: Vec<Vec<f64>>, prev_layer_membw_agg: Vec<Vec<u64>>, cpu_busy: f64, prev_total_cpu: fb_procfs::CpuStat,
988 prev_pmu_resctrl_membw: (u64, u64), system_cpu_util_ewma: f64, layer_dsq_insert_ewma: Vec<f64>, bpf_stats: BpfStats,
994 prev_bpf_stats: BpfStats,
995
996 processing_dur: Duration,
997 prev_processing_dur: Duration,
998
999 layer_slice_us: Vec<u64>,
1000
1001 gpu_tasks_affinitized: u64,
1002 gpu_task_affinitization_ms: u64,
1003}
1004
1005impl Stats {
1006 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1007 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1008
1009 for cpu in 0..*NR_CPUS_POSSIBLE {
1010 for layer in 0..nr_layers {
1011 for usage in 0..NR_LAYER_USAGES {
1012 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
1013 }
1014 }
1015 }
1016
1017 layer_usages
1018 }
1019
1020 fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1022 let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1023
1024 for cpu in 0..*NR_CPUS_POSSIBLE {
1025 for layer in 0..nr_layers {
1026 for usage in 0..NR_LAYER_USAGES {
1027 layer_membw_agg[layer][usage] += cpu_ctxs[cpu].layer_membw_agg[layer][usage];
1028 }
1029 }
1030 }
1031
1032 layer_membw_agg
1033 }
1034
1035 fn resctrl_read_total_membw() -> Result<u64> {
1047 let mut total_membw = 0u64;
1048 for entry in WalkDir::new("/sys/fs/resctrl/mon_data")
1049 .min_depth(1)
1050 .into_iter()
1051 .filter_map(Result::ok)
1052 .filter(|x| x.path().is_dir())
1053 {
1054 let mut path = entry.path().to_path_buf();
1055 path.push("mbm_total_bytes");
1056 total_membw += fs::read_to_string(path)?.trim().parse::<u64>()?;
1057 }
1058
1059 Ok(total_membw)
1060 }
1061
1062 fn new(
1063 skel: &mut BpfSkel,
1064 proc_reader: &fb_procfs::ProcReader,
1065 gpu_task_affinitizer: &GpuTaskAffinitizer,
1066 ) -> Result<Self> {
1067 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
1068 let cpu_ctxs = read_cpu_ctxs(skel)?;
1069 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1070 let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
1071 let pmu_membw = Self::read_layer_membw_agg(&cpu_ctxs, nr_layers);
1072
1073 Ok(Self {
1074 at: Instant::now(),
1075 elapsed: Default::default(),
1076 nr_layers,
1077 nr_layer_tasks: vec![0; nr_layers],
1078 nr_nodes,
1079
1080 total_util: 0.0,
1081 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1082 layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1083 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1084 prev_layer_membw_agg: pmu_membw,
1088 prev_pmu_resctrl_membw: (0, 0),
1089
1090 cpu_busy: 0.0,
1091 prev_total_cpu: read_total_cpu(proc_reader)?,
1092 system_cpu_util_ewma: 0.0,
1093 layer_dsq_insert_ewma: vec![0.0; nr_layers],
1094
1095 bpf_stats: bpf_stats.clone(),
1096 prev_bpf_stats: bpf_stats,
1097
1098 processing_dur: Default::default(),
1099 prev_processing_dur: Default::default(),
1100
1101 layer_slice_us: vec![0; nr_layers],
1102 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1103 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1104 })
1105 }
1106
1107 fn refresh(
1108 &mut self,
1109 skel: &mut BpfSkel,
1110 proc_reader: &fb_procfs::ProcReader,
1111 now: Instant,
1112 cur_processing_dur: Duration,
1113 gpu_task_affinitizer: &GpuTaskAffinitizer,
1114 ) -> Result<()> {
1115 let elapsed = now.duration_since(self.at);
1116 let elapsed_f64 = elapsed.as_secs_f64();
1117 let cpu_ctxs = read_cpu_ctxs(skel)?;
1118
1119 let nr_layer_tasks: Vec<usize> = skel
1120 .maps
1121 .bss_data
1122 .as_ref()
1123 .unwrap()
1124 .layers
1125 .iter()
1126 .take(self.nr_layers)
1127 .map(|layer| layer.nr_tasks as usize)
1128 .collect();
1129 let layer_slice_us: Vec<u64> = skel
1130 .maps
1131 .bss_data
1132 .as_ref()
1133 .unwrap()
1134 .layers
1135 .iter()
1136 .take(self.nr_layers)
1137 .map(|layer| layer.slice_ns / 1000_u64)
1138 .collect();
1139
1140 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1141 let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1142
1143 let (pmu_prev, resctrl_prev) = self.prev_pmu_resctrl_membw;
1151 let pmu_cur: u64 = cur_layer_membw_agg
1152 .iter()
1153 .map(|membw_agg| membw_agg[LAYER_USAGE_OPEN] + membw_agg[LAYER_USAGE_OWNED])
1154 .sum();
1155 let resctrl_cur = Self::resctrl_read_total_membw()?;
1156 let factor = (resctrl_cur - resctrl_prev) as f64 / (pmu_cur - pmu_prev) as f64;
1157
1158 let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1160 cur_agg
1161 .iter()
1162 .zip(prev_agg.iter())
1163 .map(|(cur, prev)| {
1164 cur.iter()
1165 .zip(prev.iter())
1166 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1167 .collect()
1168 })
1169 .collect()
1170 };
1171
1172 let compute_mem_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1175 cur_agg
1176 .iter()
1177 .zip(prev_agg.iter())
1178 .map(|(cur, prev)| {
1179 cur.iter()
1180 .zip(prev.iter())
1181 .map(|(c, p)| (*c as i64 - *p as i64) as f64 / (1024 as f64).powf(3.0))
1182 .collect()
1183 })
1184 .collect()
1185 };
1186
1187 let cur_layer_utils: Vec<Vec<f64>> =
1188 compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1189
1190 let cur_layer_membw: Vec<Vec<f64>> =
1192 compute_mem_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1193
1194 let cur_layer_membw: Vec<Vec<f64>> = cur_layer_membw
1195 .iter()
1196 .map(|x| x.iter().map(|x| *x * factor).collect())
1197 .collect();
1198
1199 let metric_decay =
1200 |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>, decay_rate: f64| {
1201 cur_metric
1202 .iter()
1203 .zip(prev_metric.iter())
1204 .map(|(cur, prev)| {
1205 cur.iter()
1206 .zip(prev.iter())
1207 .map(|(c, p)| {
1208 let decay = decay_rate.powf(elapsed_f64);
1209 p * decay + c * (1.0 - decay)
1210 })
1211 .collect()
1212 })
1213 .collect()
1214 };
1215
1216 let layer_utils: Vec<Vec<f64>> =
1217 metric_decay(cur_layer_utils, &self.layer_utils, *USAGE_DECAY);
1218 let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws, 0.0);
1219
1220 let cur_total_cpu = read_total_cpu(proc_reader)?;
1221 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1222
1223 const SYS_CPU_UTIL_EWMA_SECS: f64 = 10.0;
1225 let elapsed_f64 = elapsed.as_secs_f64();
1226 let alpha = elapsed_f64 / SYS_CPU_UTIL_EWMA_SECS.max(elapsed_f64);
1227 let system_cpu_util_ewma = alpha * cpu_busy + (1.0 - alpha) * self.system_cpu_util_ewma;
1228
1229 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1230 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1231
1232 const DSQ_INSERT_EWMA_SECS: f64 = 10.0;
1234 let dsq_alpha = elapsed_f64 / DSQ_INSERT_EWMA_SECS.max(elapsed_f64);
1235 let layer_dsq_insert_ewma: Vec<f64> = (0..self.nr_layers)
1236 .map(|layer_id| {
1237 let sel_local = bpf_stats.lstats[layer_id]
1238 [bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize]
1239 as f64;
1240 let enq_local = bpf_stats.lstats[layer_id]
1241 [bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize]
1242 as f64;
1243 let enq_dsq = bpf_stats.lstats[layer_id]
1244 [bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize]
1245 as f64;
1246 let total_dispatches = sel_local + enq_local + enq_dsq;
1247
1248 let cur_ratio = if total_dispatches > 0.0 {
1249 enq_dsq / total_dispatches
1250 } else {
1251 0.0
1252 };
1253
1254 dsq_alpha * cur_ratio + (1.0 - dsq_alpha) * self.layer_dsq_insert_ewma[layer_id]
1255 })
1256 .collect();
1257
1258 let processing_dur = cur_processing_dur
1259 .checked_sub(self.prev_processing_dur)
1260 .unwrap();
1261
1262 *self = Self {
1263 at: now,
1264 elapsed,
1265 nr_layers: self.nr_layers,
1266 nr_layer_tasks,
1267 nr_nodes: self.nr_nodes,
1268
1269 total_util: layer_utils
1270 .iter()
1271 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1272 .sum(),
1273 layer_utils,
1274 layer_membws,
1275 prev_layer_usages: cur_layer_usages,
1276 prev_layer_membw_agg: cur_layer_membw_agg,
1277 prev_pmu_resctrl_membw: (pmu_cur, resctrl_cur),
1279
1280 cpu_busy,
1281 prev_total_cpu: cur_total_cpu,
1282 system_cpu_util_ewma,
1283 layer_dsq_insert_ewma,
1284
1285 bpf_stats,
1286 prev_bpf_stats: cur_bpf_stats,
1287
1288 processing_dur,
1289 prev_processing_dur: cur_processing_dur,
1290
1291 layer_slice_us,
1292 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1293 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1294 };
1295 Ok(())
1296 }
1297}
1298
1299#[derive(Debug)]
1300struct Layer {
1301 name: String,
1302 kind: LayerKind,
1303 growth_algo: LayerGrowthAlgo,
1304 core_order: Vec<usize>,
1305
1306 target_llc_cpus: (usize, usize),
1307 assigned_llcs: Vec<usize>,
1308
1309 nr_cpus: usize,
1310 nr_llc_cpus: Vec<usize>,
1311 cpus: Cpumask,
1312 allowed_cpus: Cpumask,
1313}
1314
1315fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1316 fs::read_to_string("/proc/kallsyms")?
1317 .lines()
1318 .find(|line| line.contains(sym_name))
1319 .and_then(|line| line.split_whitespace().next())
1320 .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1321 .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1322}
1323
1324fn resolve_cpus_pct_range(
1325 cpus_range: &Option<(usize, usize)>,
1326 cpus_range_frac: &Option<(f64, f64)>,
1327 max_cpus: usize,
1328) -> Result<(usize, usize)> {
1329 match (cpus_range, cpus_range_frac) {
1330 (Some(_x), Some(_y)) => {
1331 bail!("cpus_range cannot be used with cpus_pct.");
1332 }
1333 (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1334 (None, Some((cpus_frac_min, cpus_frac_max))) => {
1335 if *cpus_frac_min < 0_f64
1336 || *cpus_frac_min > 1_f64
1337 || *cpus_frac_max < 0_f64
1338 || *cpus_frac_max > 1_f64
1339 {
1340 bail!("cpus_range_frac values must be between 0.0 and 1.0");
1341 }
1342 let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1343 let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1344 Ok((
1345 std::cmp::max(cpus_min_count, 1),
1346 std::cmp::min(cpus_max_count, max_cpus),
1347 ))
1348 }
1349 (None, None) => Ok((0, max_cpus)),
1350 }
1351}
1352
1353impl Layer {
1354 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1355 let name = &spec.name;
1356 let kind = spec.kind.clone();
1357 let mut allowed_cpus = Cpumask::new();
1358 match &kind {
1359 LayerKind::Confined {
1360 cpus_range,
1361 cpus_range_frac,
1362 common: LayerCommon { nodes, llcs, .. },
1363 ..
1364 } => {
1365 let cpus_range =
1366 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1367 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1368 bail!("invalid cpus_range {:?}", cpus_range);
1369 }
1370 if nodes.is_empty() && llcs.is_empty() {
1371 allowed_cpus.set_all();
1372 } else {
1373 for (node_id, node) in &topo.nodes {
1375 if nodes.contains(node_id) {
1377 for &id in node.all_cpus.keys() {
1378 allowed_cpus.set_cpu(id)?;
1379 }
1380 }
1381 for (llc_id, llc) in &node.llcs {
1383 if llcs.contains(llc_id) {
1384 for &id in llc.all_cpus.keys() {
1385 allowed_cpus.set_cpu(id)?;
1386 }
1387 }
1388 }
1389 }
1390 }
1391 }
1392 LayerKind::Grouped {
1393 common: LayerCommon { nodes, llcs, .. },
1394 ..
1395 }
1396 | LayerKind::Open {
1397 common: LayerCommon { nodes, llcs, .. },
1398 ..
1399 } => {
1400 if nodes.is_empty() && llcs.is_empty() {
1401 allowed_cpus.set_all();
1402 } else {
1403 for (node_id, node) in &topo.nodes {
1405 if nodes.contains(node_id) {
1407 for &id in node.all_cpus.keys() {
1408 allowed_cpus.set_cpu(id)?;
1409 }
1410 }
1411 for (llc_id, llc) in &node.llcs {
1413 if llcs.contains(llc_id) {
1414 for &id in llc.all_cpus.keys() {
1415 allowed_cpus.set_cpu(id)?;
1416 }
1417 }
1418 }
1419 }
1420 }
1421 }
1422 }
1423
1424 if let Some(util_range) = kind.util_range() {
1427 if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1428 bail!("invalid util_range {:?}", util_range);
1429 }
1430 }
1431
1432 let layer_growth_algo = kind.common().growth_algo.clone();
1433
1434 debug!(
1435 "layer: {} algo: {:?} core order: {:?}",
1436 name, &layer_growth_algo, core_order
1437 );
1438
1439 Ok(Self {
1440 name: name.into(),
1441 kind,
1442 growth_algo: layer_growth_algo,
1443 core_order: core_order.clone(),
1444
1445 target_llc_cpus: (0, 0),
1446 assigned_llcs: vec![],
1447
1448 nr_cpus: 0,
1449 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1450 cpus: Cpumask::new(),
1451 allowed_cpus,
1452 })
1453 }
1454
1455 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1456 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1457 Some(ret) => ret.clone(),
1458 None => return Ok(0),
1459 };
1460
1461 let nr_to_free = cpus_to_free.weight();
1462
1463 Ok(if nr_to_free <= max_to_free {
1464 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1465 self.cpus &= &cpus_to_free.not();
1466 self.nr_cpus -= nr_to_free;
1467 for cpu in cpus_to_free.iter() {
1468 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1469 }
1470 cpu_pool.free(&cpus_to_free)?;
1471 nr_to_free
1472 } else {
1473 0
1474 })
1475 }
1476
1477 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1478 let new_cpus = match cpu_pool
1479 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1480 .clone()
1481 {
1482 Some(ret) => ret.clone(),
1483 None => {
1484 trace!("layer-{} can't grow, no CPUs", &self.name);
1485 return Ok(0);
1486 }
1487 };
1488
1489 let nr_new_cpus = new_cpus.weight();
1490
1491 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1492 self.cpus |= &new_cpus;
1493 self.nr_cpus += nr_new_cpus;
1494 for cpu in new_cpus.iter() {
1495 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1496 }
1497 Ok(nr_new_cpus)
1498 }
1499}
1500#[derive(Debug, Clone)]
1501struct NodeInfo {
1502 node_mask: nix::sched::CpuSet,
1503 _node_id: usize,
1504}
1505
1506#[derive(Debug)]
1507struct GpuTaskAffinitizer {
1508 gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1511 gpu_pids_to_devs: HashMap<Pid, u32>,
1512 last_process_time: Option<Instant>,
1513 sys: System,
1514 pid_map: HashMap<Pid, Vec<Pid>>,
1515 poll_interval: Duration,
1516 enable: bool,
1517 tasks_affinitized: u64,
1518 last_task_affinitization_ms: u64,
1519}
1520
1521impl GpuTaskAffinitizer {
1522 pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1523 GpuTaskAffinitizer {
1524 gpu_devs_to_node_info: HashMap::new(),
1525 gpu_pids_to_devs: HashMap::new(),
1526 last_process_time: None,
1527 sys: System::default(),
1528 pid_map: HashMap::new(),
1529 poll_interval: Duration::from_secs(poll_interval),
1530 enable,
1531 tasks_affinitized: 0,
1532 last_task_affinitization_ms: 0,
1533 }
1534 }
1535
1536 fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1537 for (chunk, &mask) in affinity.iter().enumerate() {
1538 let mut inner_offset: u64 = 1;
1539 for _ in 0..64 {
1540 if (mask & inner_offset) != 0 {
1541 return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1542 }
1543 inner_offset = inner_offset << 1;
1544 }
1545 }
1546 anyhow::bail!("unable to get CPU from NVML bitmask");
1547 }
1548
1549 fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1550 let mut cpuset = CpuSet::new();
1551 for (cpu_id, _cpu) in &node.all_cpus {
1552 cpuset.set(*cpu_id)?;
1553 }
1554 Ok(cpuset)
1555 }
1556
1557 fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1558 let nvml = nvml()?;
1559 let device_count = nvml.device_count()?;
1560
1561 for idx in 0..device_count {
1562 let dev = nvml.device_by_index(idx)?;
1563 let cpu = dev.cpu_affinity(16)?;
1565 let ideal_cpu = self.find_one_cpu(cpu)?;
1566 if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1567 self.gpu_devs_to_node_info.insert(
1568 idx,
1569 NodeInfo {
1570 node_mask: self.node_to_cpuset(
1571 topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1572 )?,
1573 _node_id: cpu.node_id,
1574 },
1575 );
1576 }
1577 }
1578 Ok(())
1579 }
1580
1581 fn update_gpu_pids(&mut self) -> Result<()> {
1582 let nvml = nvml()?;
1583 for i in 0..nvml.device_count()? {
1584 let device = nvml.device_by_index(i)?;
1585 for proc in device
1586 .running_compute_processes()?
1587 .into_iter()
1588 .chain(device.running_graphics_processes()?.into_iter())
1589 {
1590 self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1591 }
1592 }
1593 Ok(())
1594 }
1595
1596 fn update_process_info(&mut self) -> Result<()> {
1597 self.sys.refresh_processes_specifics(
1598 ProcessesToUpdate::All,
1599 true,
1600 ProcessRefreshKind::nothing(),
1601 );
1602 self.pid_map.clear();
1603 for (pid, proc_) in self.sys.processes() {
1604 if let Some(ppid) = proc_.parent() {
1605 self.pid_map.entry(ppid).or_default().push(*pid);
1606 }
1607 }
1608 Ok(())
1609 }
1610
1611 fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1612 let mut work = VecDeque::from([root_pid]);
1613 let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1614
1615 while let Some(pid) = work.pop_front() {
1616 if pids_and_tids.insert(pid) {
1617 if let Some(kids) = self.pid_map.get(&pid) {
1618 work.extend(kids);
1619 }
1620 if let Some(proc_) = self.sys.process(pid) {
1621 if let Some(tasks) = proc_.tasks() {
1622 pids_and_tids.extend(tasks.iter().copied());
1623 }
1624 }
1625 }
1626 }
1627 pids_and_tids
1628 }
1629
1630 fn affinitize_gpu_pids(&mut self) -> Result<()> {
1631 if !self.enable {
1632 return Ok(());
1633 }
1634 for (pid, dev) in &self.gpu_pids_to_devs {
1635 let node_info = self
1636 .gpu_devs_to_node_info
1637 .get(&dev)
1638 .expect("Unable to get gpu pid node mask");
1639 for child in self.get_child_pids_and_tids(*pid) {
1640 match nix::sched::sched_setaffinity(
1641 nix::unistd::Pid::from_raw(child.as_u32() as i32),
1642 &node_info.node_mask,
1643 ) {
1644 Ok(_) => {
1645 self.tasks_affinitized += 1;
1647 }
1648 Err(_) => {
1649 debug!(
1650 "Error affinitizing gpu pid {} to node {:#?}",
1651 child.as_u32(),
1652 node_info
1653 );
1654 }
1655 };
1656 }
1657 }
1658 Ok(())
1659 }
1660
1661 pub fn maybe_affinitize(&mut self) {
1662 if !self.enable {
1663 return;
1664 }
1665 let now = Instant::now();
1666
1667 if let Some(last_process_time) = self.last_process_time {
1668 if (now - last_process_time) < self.poll_interval {
1669 return;
1670 }
1671 }
1672
1673 match self.update_gpu_pids() {
1674 Ok(_) => {}
1675 Err(e) => {
1676 error!("Error updating GPU PIDs: {}", e);
1677 }
1678 };
1679 match self.update_process_info() {
1680 Ok(_) => {}
1681 Err(e) => {
1682 error!("Error updating process info to affinitize GPU PIDs: {}", e);
1683 }
1684 };
1685 match self.affinitize_gpu_pids() {
1686 Ok(_) => {}
1687 Err(e) => {
1688 error!("Error updating GPU PIDs: {}", e);
1689 }
1690 };
1691 self.last_process_time = Some(now);
1692 self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1693
1694 return;
1695 }
1696
1697 pub fn init(&mut self, topo: Arc<Topology>) {
1698 if !self.enable || self.last_process_time.is_some() {
1699 return;
1700 }
1701
1702 match self.init_dev_node_map(topo) {
1703 Ok(_) => {}
1704 Err(e) => {
1705 error!("Error initializing gpu node dev map: {}", e);
1706 }
1707 };
1708 self.sys = System::new_all();
1709 return;
1710 }
1711}
1712
1713struct Scheduler<'a> {
1714 skel: BpfSkel<'a>,
1715 struct_ops: Option<libbpf_rs::Link>,
1716 layer_specs: Vec<LayerSpec>,
1717
1718 sched_intv: Duration,
1719 layer_refresh_intv: Duration,
1720
1721 cpu_pool: CpuPool,
1722 layers: Vec<Layer>,
1723 idle_qos_enabled: bool,
1724
1725 proc_reader: fb_procfs::ProcReader,
1726 sched_stats: Stats,
1727
1728 cgroup_regexes: Option<HashMap<u32, Regex>>,
1729
1730 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1731 processing_dur: Duration,
1732
1733 topo: Arc<Topology>,
1734 netdevs: BTreeMap<String, NetDev>,
1735 stats_server: StatsServer<StatsReq, StatsRes>,
1736 gpu_task_handler: GpuTaskAffinitizer,
1737}
1738
1739impl<'a> Scheduler<'a> {
1740 fn init_layers(
1741 skel: &mut OpenBpfSkel,
1742 specs: &[LayerSpec],
1743 topo: &Topology,
1744 ) -> Result<HashMap<u32, Regex>> {
1745 skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1746 let mut perf_set = false;
1747
1748 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1749 let mut layer_weights: Vec<usize> = vec![];
1750 let mut cgroup_regex_id = 0;
1751 let mut cgroup_regexes = HashMap::new();
1752
1753 for (spec_i, spec) in specs.iter().enumerate() {
1754 let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1755
1756 for (or_i, or) in spec.matches.iter().enumerate() {
1757 for (and_i, and) in or.iter().enumerate() {
1758 let mt = &mut layer.matches[or_i].matches[and_i];
1759
1760 mt.exclude.write(false);
1762
1763 match and {
1764 LayerMatch::CgroupPrefix(prefix) => {
1765 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1766 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1767 }
1768 LayerMatch::CgroupSuffix(suffix) => {
1769 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1770 copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1771 }
1772 LayerMatch::CgroupRegex(regex_str) => {
1773 if cgroup_regex_id >= bpf_intf::consts_MAX_CGROUP_REGEXES {
1774 bail!(
1775 "Too many cgroup regex rules. Maximum allowed: {}",
1776 bpf_intf::consts_MAX_CGROUP_REGEXES
1777 );
1778 }
1779
1780 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_REGEX as i32;
1782 mt.cgroup_regex_id = cgroup_regex_id;
1783
1784 let regex = Regex::new(regex_str).with_context(|| {
1785 format!("Invalid regex '{}' in layer '{}'", regex_str, spec.name)
1786 })?;
1787 cgroup_regexes.insert(cgroup_regex_id, regex);
1788 cgroup_regex_id += 1;
1789 }
1790 LayerMatch::CgroupContains(substr) => {
1791 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1792 copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1793 }
1794 LayerMatch::CommPrefix(prefix) => {
1795 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1796 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1797 }
1798 LayerMatch::CommPrefixExclude(prefix) => {
1799 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1800 mt.exclude.write(true);
1801 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1802 }
1803 LayerMatch::PcommPrefix(prefix) => {
1804 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1805 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1806 }
1807 LayerMatch::PcommPrefixExclude(prefix) => {
1808 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1809 mt.exclude.write(true);
1810 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1811 }
1812 LayerMatch::NiceAbove(nice) => {
1813 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1814 mt.nice = *nice;
1815 }
1816 LayerMatch::NiceBelow(nice) => {
1817 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1818 mt.nice = *nice;
1819 }
1820 LayerMatch::NiceEquals(nice) => {
1821 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1822 mt.nice = *nice;
1823 }
1824 LayerMatch::UIDEquals(user_id) => {
1825 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1826 mt.user_id = *user_id;
1827 }
1828 LayerMatch::GIDEquals(group_id) => {
1829 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1830 mt.group_id = *group_id;
1831 }
1832 LayerMatch::PIDEquals(pid) => {
1833 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1834 mt.pid = *pid;
1835 }
1836 LayerMatch::PPIDEquals(ppid) => {
1837 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1838 mt.ppid = *ppid;
1839 }
1840 LayerMatch::TGIDEquals(tgid) => {
1841 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1842 mt.tgid = *tgid;
1843 }
1844 LayerMatch::NSPIDEquals(nsid, pid) => {
1845 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1846 mt.nsid = *nsid;
1847 mt.pid = *pid;
1848 }
1849 LayerMatch::NSEquals(nsid) => {
1850 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1851 mt.nsid = *nsid as u64;
1852 }
1853 LayerMatch::CmdJoin(joincmd) => {
1854 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1855 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1856 }
1857 LayerMatch::IsGroupLeader(polarity) => {
1858 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1859 mt.is_group_leader.write(*polarity);
1860 }
1861 LayerMatch::IsKthread(polarity) => {
1862 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1863 mt.is_kthread.write(*polarity);
1864 }
1865 LayerMatch::UsedGpuTid(polarity) => {
1866 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1867 mt.used_gpu_tid.write(*polarity);
1868 }
1869 LayerMatch::UsedGpuPid(polarity) => {
1870 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1871 mt.used_gpu_pid.write(*polarity);
1872 }
1873 LayerMatch::AvgRuntime(min, max) => {
1874 mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1875 mt.min_avg_runtime_us = *min;
1876 mt.max_avg_runtime_us = *max;
1877 }
1878 LayerMatch::HintEquals(hint) => {
1879 mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1880 mt.hint = *hint;
1881 }
1882 LayerMatch::SystemCpuUtilBelow(threshold) => {
1883 mt.kind = bpf_intf::layer_match_kind_MATCH_SYSTEM_CPU_UTIL_BELOW as i32;
1884 mt.system_cpu_util_below = (*threshold * 10000.0) as u64;
1885 }
1886 LayerMatch::DsqInsertBelow(threshold) => {
1887 mt.kind = bpf_intf::layer_match_kind_MATCH_DSQ_INSERT_BELOW as i32;
1888 mt.dsq_insert_below = (*threshold * 10000.0) as u64;
1889 }
1890 }
1891 }
1892 layer.matches[or_i].nr_match_ands = or.len() as i32;
1893 }
1894
1895 layer.nr_match_ors = spec.matches.len() as u32;
1896 layer.kind = spec.kind.as_bpf_enum();
1897
1898 {
1899 let LayerCommon {
1900 min_exec_us,
1901 yield_ignore,
1902 perf,
1903 preempt,
1904 preempt_first,
1905 exclusive,
1906 allow_node_aligned,
1907 skip_remote_node,
1908 prev_over_idle_core,
1909 growth_algo,
1910 nodes,
1911 slice_us,
1912 fifo,
1913 weight,
1914 disallow_open_after_us,
1915 disallow_preempt_after_us,
1916 xllc_mig_min_us,
1917 placement,
1918 member_expire_ms,
1919 ..
1920 } = spec.kind.common();
1921
1922 layer.slice_ns = *slice_us * 1000;
1923 layer.fifo.write(*fifo);
1924 layer.min_exec_ns = min_exec_us * 1000;
1925 layer.yield_step_ns = if *yield_ignore > 0.999 {
1926 0
1927 } else if *yield_ignore < 0.001 {
1928 layer.slice_ns
1929 } else {
1930 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1931 };
1932 let mut layer_name: String = spec.name.clone();
1933 layer_name.truncate(MAX_LAYER_NAME);
1934 copy_into_cstr(&mut layer.name, layer_name.as_str());
1935 layer.preempt.write(*preempt);
1936 layer.preempt_first.write(*preempt_first);
1937 layer.excl.write(*exclusive);
1938 layer.allow_node_aligned.write(*allow_node_aligned);
1939 layer.skip_remote_node.write(*skip_remote_node);
1940 layer.prev_over_idle_core.write(*prev_over_idle_core);
1941 layer.growth_algo = growth_algo.as_bpf_enum();
1942 layer.weight = *weight;
1943 layer.member_expire_ms = *member_expire_ms;
1944 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1945 v if v == u64::MAX => v,
1946 v => v * 1000,
1947 };
1948 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1949 v if v == u64::MAX => v,
1950 v => v * 1000,
1951 };
1952 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1953 layer_weights.push(layer.weight.try_into().unwrap());
1954 layer.perf = u32::try_from(*perf)?;
1955 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1956 for (topo_node_id, topo_node) in &topo.nodes {
1957 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1958 continue;
1959 }
1960 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1961 }
1962
1963 let task_place = |place: u32| crate::types::layer_task_place(place);
1964 layer.task_place = match placement {
1965 LayerPlacement::Standard => {
1966 task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1967 }
1968 LayerPlacement::Sticky => {
1969 task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1970 }
1971 LayerPlacement::Floating => {
1972 task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1973 }
1974 };
1975 }
1976
1977 layer.is_protected.write(match spec.kind {
1978 LayerKind::Open { .. } => false,
1979 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1980 protected
1981 }
1982 });
1983
1984 match &spec.cpuset {
1985 Some(mask) => {
1986 Self::update_cpumask(&mask, &mut layer.cpuset);
1987 }
1988 None => {
1989 for i in 0..layer.cpuset.len() {
1990 layer.cpuset[i] = u8::MAX;
1991 }
1992 }
1993 };
1994
1995 perf_set |= layer.perf > 0;
1996 }
1997
1998 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1999 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
2000 skel.maps
2001 .rodata_data
2002 .as_mut()
2003 .unwrap()
2004 .layer_iteration_order[idx] = *layer_idx as u32;
2005 }
2006
2007 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
2008 warn!("cpufreq support not available, ignoring perf configurations");
2009 }
2010
2011 Ok(cgroup_regexes)
2012 }
2013
2014 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
2015 skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
2016 skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
2017
2018 for (&node_id, node) in &topo.nodes {
2019 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
2020 skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
2021 let raw_numa_slice = node.span.as_raw_slice();
2022 let node_cpumask_slice =
2023 &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
2024 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
2025 left.clone_from_slice(raw_numa_slice);
2026 debug!(
2027 "node {} mask: {:?}",
2028 node_id,
2029 skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
2030 );
2031
2032 for llc in node.llcs.values() {
2033 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
2034 skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
2035 }
2036 }
2037
2038 for cpu in topo.all_cpus.values() {
2039 skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
2040 }
2041 }
2042
2043 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
2044 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
2045 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
2046 vec
2047 };
2048 let radiate_cpu =
2049 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
2050 vec.sort_by_key(|&id| {
2051 (
2052 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
2053 (center_cpu as i32 - id as i32).abs(),
2054 )
2055 });
2056 vec
2057 };
2058
2059 for (&cpu_id, cpu) in &topo.all_cpus {
2060 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
2062 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
2063 let node_span = &topo.nodes[&cpu.node_id].span;
2064 let sys_span = &topo.span;
2065
2066 let sys_span = sys_span.and(&node_span.not());
2068 let node_span = node_span.and(&llc_span.not());
2069 let llc_span = llc_span.and(&core_span.not());
2070 core_span.clear_cpu(cpu_id).unwrap();
2071
2072 let mut sys_order: Vec<usize> = sys_span.iter().collect();
2074 let mut node_order: Vec<usize> = node_span.iter().collect();
2075 let mut llc_order: Vec<usize> = llc_span.iter().collect();
2076 let mut core_order: Vec<usize> = core_span.iter().collect();
2077
2078 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
2083 node_order = radiate(node_order, cpu.node_id);
2084 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
2085 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
2086
2087 let mut order: Vec<usize> = vec![];
2089 let mut idx: usize = 0;
2090
2091 idx += 1;
2092 order.push(cpu_id);
2093
2094 idx += core_order.len();
2095 order.append(&mut core_order);
2096 let core_end = idx;
2097
2098 idx += llc_order.len();
2099 order.append(&mut llc_order);
2100 let llc_end = idx;
2101
2102 idx += node_order.len();
2103 order.append(&mut node_order);
2104 let node_end = idx;
2105
2106 idx += sys_order.len();
2107 order.append(&mut sys_order);
2108 let sys_end = idx;
2109
2110 debug!(
2111 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
2112 cpu_id, core_end, llc_end, node_end, sys_end, &order
2113 );
2114
2115 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
2117 for (i, &cpu) in order.iter().enumerate() {
2118 pmap.cpus[i] = cpu as u16;
2119 }
2120 pmap.core_end = core_end as u32;
2121 pmap.llc_end = llc_end as u32;
2122 pmap.node_end = node_end as u32;
2123 pmap.sys_end = sys_end as u32;
2124 }
2125 }
2126
2127 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
2128 cpu_ctxs
2129 .into_iter()
2130 .map(|cpu_ctx| {
2131 let bytes = unsafe {
2132 std::slice::from_raw_parts(
2133 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
2134 std::mem::size_of::<bpf_intf::cpu_ctx>(),
2135 )
2136 };
2137 bytes.to_vec()
2138 })
2139 .collect()
2140 }
2141
2142 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
2143 let key = (0_u32).to_ne_bytes();
2144 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
2145 let cpu_ctxs_vec = skel
2146 .maps
2147 .cpu_ctxs
2148 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
2149 .context("Failed to lookup cpu_ctx")?
2150 .unwrap();
2151
2152 let op_layers: Vec<u32> = layer_specs
2153 .iter()
2154 .enumerate()
2155 .filter(|(_idx, spec)| match &spec.kind {
2156 LayerKind::Open { .. } => spec.kind.common().preempt,
2157 _ => false,
2158 })
2159 .map(|(idx, _)| idx as u32)
2160 .collect();
2161 let on_layers: Vec<u32> = layer_specs
2162 .iter()
2163 .enumerate()
2164 .filter(|(_idx, spec)| match &spec.kind {
2165 LayerKind::Open { .. } => !spec.kind.common().preempt,
2166 _ => false,
2167 })
2168 .map(|(idx, _)| idx as u32)
2169 .collect();
2170 let gp_layers: Vec<u32> = layer_specs
2171 .iter()
2172 .enumerate()
2173 .filter(|(_idx, spec)| match &spec.kind {
2174 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2175 _ => false,
2176 })
2177 .map(|(idx, _)| idx as u32)
2178 .collect();
2179 let gn_layers: Vec<u32> = layer_specs
2180 .iter()
2181 .enumerate()
2182 .filter(|(_idx, spec)| match &spec.kind {
2183 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2184 _ => false,
2185 })
2186 .map(|(idx, _)| idx as u32)
2187 .collect();
2188
2189 for cpu in 0..*NR_CPUS_POSSIBLE {
2191 cpu_ctxs.push(*unsafe {
2192 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2193 });
2194
2195 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2196 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2197 cpu_ctxs[cpu].cpu = cpu as i32;
2198 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2199 cpu_ctxs[cpu].is_big = is_big;
2200
2201 fastrand::seed(cpu as u64);
2202
2203 let mut ogp_order = op_layers.clone();
2204 ogp_order.append(&mut gp_layers.clone());
2205 fastrand::shuffle(&mut ogp_order);
2206
2207 let mut ogn_order = on_layers.clone();
2208 ogn_order.append(&mut gn_layers.clone());
2209 fastrand::shuffle(&mut ogn_order);
2210
2211 let mut op_order = op_layers.clone();
2212 fastrand::shuffle(&mut op_order);
2213
2214 let mut on_order = on_layers.clone();
2215 fastrand::shuffle(&mut on_order);
2216
2217 let mut gp_order = gp_layers.clone();
2218 fastrand::shuffle(&mut gp_order);
2219
2220 let mut gn_order = gn_layers.clone();
2221 fastrand::shuffle(&mut gn_order);
2222
2223 for i in 0..MAX_LAYERS {
2224 cpu_ctxs[cpu].ogp_layer_order[i] =
2225 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2226 cpu_ctxs[cpu].ogn_layer_order[i] =
2227 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2228
2229 cpu_ctxs[cpu].op_layer_order[i] =
2230 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2231 cpu_ctxs[cpu].on_layer_order[i] =
2232 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2233 cpu_ctxs[cpu].gp_layer_order[i] =
2234 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2235 cpu_ctxs[cpu].gn_layer_order[i] =
2236 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2237 }
2238 }
2239
2240 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2241
2242 skel.maps
2243 .cpu_ctxs
2244 .update_percpu(
2245 &key,
2246 &Self::convert_cpu_ctxs(cpu_ctxs),
2247 libbpf_rs::MapFlags::ANY,
2248 )
2249 .context("Failed to update cpu_ctx")?;
2250
2251 Ok(())
2252 }
2253
2254 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2255 for (&llc_id, llc) in &topo.all_llcs {
2256 let mut node_order: Vec<usize> =
2258 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2259 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2260
2261 sys_order.retain(|id| !node_order.contains(id));
2263 node_order.retain(|&id| id != llc_id);
2264
2265 fastrand::seed(llc_id as u64);
2268 fastrand::shuffle(&mut sys_order);
2269 fastrand::shuffle(&mut node_order);
2270
2271 let mut order: Vec<usize> = vec![];
2273 let mut idx: usize = 0;
2274
2275 idx += 1;
2276 order.push(llc_id);
2277
2278 idx += node_order.len();
2279 order.append(&mut node_order);
2280 let node_end = idx;
2281
2282 idx += sys_order.len();
2283 order.append(&mut sys_order);
2284 let sys_end = idx;
2285
2286 debug!(
2287 "LLC[{}] proximity map[{}/{}]: {:?}",
2288 llc_id, node_end, sys_end, &order
2289 );
2290
2291 let key = llc_id as u32;
2296 let llc_id_slice =
2297 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2298 let v = skel
2299 .maps
2300 .llc_data
2301 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2302 .unwrap()
2303 .unwrap();
2304 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2305
2306 let pmap = &mut llcc.prox_map;
2307 for (i, &llc_id) in order.iter().enumerate() {
2308 pmap.llcs[i] = llc_id as u16;
2309 }
2310 pmap.node_end = node_end as u32;
2311 pmap.sys_end = sys_end as u32;
2312
2313 let v = unsafe {
2314 std::slice::from_raw_parts(
2315 &llcc as *const bpf_intf::llc_ctx as *const u8,
2316 std::mem::size_of::<bpf_intf::llc_ctx>(),
2317 )
2318 };
2319
2320 skel.maps
2321 .llc_data
2322 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2323 }
2324
2325 Ok(())
2326 }
2327
2328 fn init(
2329 opts: &'a Opts,
2330 layer_specs: &[LayerSpec],
2331 open_object: &'a mut MaybeUninit<OpenObject>,
2332 hint_to_layer_map: &HashMap<u64, HintLayerInfo>,
2333 membw_tracking: bool,
2334 ) -> Result<Self> {
2335 let nr_layers = layer_specs.len();
2336 let mut disable_topology = opts.disable_topology.unwrap_or(false);
2337
2338 let topo = Arc::new(if disable_topology {
2339 Topology::with_flattened_llc_node()?
2340 } else if opts.topology.virt_llc.is_some() {
2341 Topology::with_args(&opts.topology)?
2342 } else {
2343 Topology::new()?
2344 });
2345
2346 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2353 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2354 }
2355 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2356 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2357 }
2358 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2359 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2360 }
2361
2362 let netdevs = if opts.netdev_irq_balance {
2363 warn!(
2364 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2365 );
2366 read_netdevs()?
2367 } else {
2368 BTreeMap::new()
2369 };
2370
2371 if !disable_topology {
2372 if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2373 disable_topology = true;
2374 };
2375 info!(
2376 "Topology awareness not specified, selecting {} based on hardware",
2377 if disable_topology {
2378 "disabled"
2379 } else {
2380 "enabled"
2381 }
2382 );
2383 };
2384
2385 let cpu_pool = CpuPool::new(topo.clone())?;
2386
2387 let layer_specs: Vec<_> = if disable_topology {
2390 info!("Disabling topology awareness");
2391 layer_specs
2392 .iter()
2393 .cloned()
2394 .map(|mut s| {
2395 s.kind.common_mut().nodes.clear();
2396 s.kind.common_mut().llcs.clear();
2397 s
2398 })
2399 .collect()
2400 } else {
2401 layer_specs.to_vec()
2402 };
2403
2404 init_libbpf_logging(None);
2406 let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2407 if !kfuncs_in_syscall {
2408            warn!("Using slow path: kfuncs not supported in syscall programs (kernel lacks commit a8e03b6bbb2c)");
2409 }
2410
2411 let debug_level = if opts.log_level.contains("trace") {
2413 2
2414 } else if opts.log_level.contains("debug") {
2415 1
2416 } else {
2417 0
2418 };
2419 let mut skel_builder = BpfSkelBuilder::default();
2420 skel_builder.obj_builder.debug(debug_level > 1);
2421
2422 info!(
2423 "Running scx_layered (build ID: {})",
2424 build_id::full_version(env!("CARGO_PKG_VERSION"))
2425 );
2426 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2427 let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2428
2429 skel.progs.scx_pmu_switch_tc.set_autoload(membw_tracking);
2431 skel.progs.scx_pmu_tick_tc.set_autoload(membw_tracking);
2432
2433 if opts.enable_gpu_support {
2436 if opts.gpu_kprobe_level >= 1 {
2439 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2440 }
2441 if opts.gpu_kprobe_level >= 2 {
2444 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2445 }
2446 if opts.gpu_kprobe_level >= 3 {
2447 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2448 }
2449 }
2450
2451 let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2452 let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2453
2454 let event = if membw_tracking {
2455 setup_membw_tracking(&mut skel)?
2456 } else {
2457 0
2458 };
2459
2460 let rodata = skel.maps.rodata_data.as_mut().unwrap();
2461
2462 if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
2463 rodata.ext_sched_class_addr = ext_sched_class_addr.unwrap();
2464 rodata.idle_sched_class_addr = idle_sched_class_addr.unwrap();
2465 } else {
2466 warn!(
2467 "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2468 );
2469 }
2470
2471 rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2472 rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2473
2474 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2476
2477 if !opts.disable_queued_wakeup {
2478 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2479 0 => info!("Kernel does not support queued wakeup optimization"),
2480 v => skel.struct_ops.layered_mut().flags |= v,
2481 }
2482 }
2483
2484 rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2485 rodata.percpu_kthread_preempt_all =
2486 !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2487 rodata.debug = debug_level as u32;
2488 rodata.slice_ns = opts.slice_us * 1000;
2489 rodata.max_exec_ns = if opts.max_exec_us > 0 {
2490 opts.max_exec_us * 1000
2491 } else {
2492 opts.slice_us * 1000 * 20
2493 };
2494 rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2495 rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2496 rodata.smt_enabled = topo.smt_enabled;
2497 rodata.has_little_cores = topo.has_little_cores();
2498 rodata.xnuma_preemption = opts.xnuma_preemption;
2499 rodata.antistall_sec = opts.antistall_sec;
2500 rodata.monitor_disable = opts.monitor_disable;
2501 rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2502 rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2503 rodata.enable_antistall = !opts.disable_antistall;
2504 rodata.enable_match_debug = opts.enable_match_debug;
2505 rodata.enable_gpu_support = opts.enable_gpu_support;
2506 rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2507
2508 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2509 rodata.__sibling_cpu[cpu] = *sib;
2510 }
2511 for cpu in topo.all_cpus.keys() {
2512 rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2513 }
2514
2515 rodata.nr_op_layers = layer_specs
2516 .iter()
2517 .filter(|spec| match &spec.kind {
2518 LayerKind::Open { .. } => spec.kind.common().preempt,
2519 _ => false,
2520 })
2521 .count() as u32;
2522 rodata.nr_on_layers = layer_specs
2523 .iter()
2524 .filter(|spec| match &spec.kind {
2525 LayerKind::Open { .. } => !spec.kind.common().preempt,
2526 _ => false,
2527 })
2528 .count() as u32;
2529 rodata.nr_gp_layers = layer_specs
2530 .iter()
2531 .filter(|spec| match &spec.kind {
2532 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2533 _ => false,
2534 })
2535 .count() as u32;
2536 rodata.nr_gn_layers = layer_specs
2537 .iter()
2538 .filter(|spec| match &spec.kind {
2539 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2540 _ => false,
2541 })
2542 .count() as u32;
2543 rodata.nr_excl_layers = layer_specs
2544 .iter()
2545 .filter(|spec| spec.kind.common().exclusive)
2546 .count() as u32;
2547
2548 let mut min_open = u64::MAX;
2549 let mut min_preempt = u64::MAX;
2550
2551 for spec in layer_specs.iter() {
2552 if let LayerKind::Open { common, .. } = &spec.kind {
2553 min_open = min_open.min(common.disallow_open_after_us.unwrap());
2554 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2555 }
2556 }
2557
2558 rodata.min_open_layer_disallow_open_after_ns = match min_open {
2559 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2560 v => v,
2561 };
2562 rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2563 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2564 v => v,
2565 };
2566
2567 for i in 0..layer_specs.len() {
2569 skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2570 }
2571 skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2572
2573 let layered_task_hint_map_path = &opts.task_hint_map;
2578 let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2579        if !layered_task_hint_map_path.is_empty() {
2581 hint_map.set_pin_path(layered_task_hint_map_path).unwrap();
2582 rodata.task_hint_map_enabled = true;
2583 }
2584
2585 if !opts.hi_fb_thread_name.is_empty() {
2586 let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2587 copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2588 rodata.enable_hi_fb_thread_name_match = true;
2589 }
2590
2591 let cgroup_regexes = Self::init_layers(&mut skel, &layer_specs, &topo)?;
2592 skel.maps.rodata_data.as_mut().unwrap().nr_cgroup_regexes = cgroup_regexes.len() as u32;
2593 Self::init_nodes(&mut skel, opts, &topo);
2594
2595 let mut skel = scx_ops_load!(skel, layered, uei)?;
2596
2597        if !hint_to_layer_map.is_empty() {
2599 for (k, v) in hint_to_layer_map.iter() {
2600 let key: u32 = *k as u32;
2601
2602 let mut info_bytes = vec![0u8; std::mem::size_of::<bpf_intf::hint_layer_info>()];
2604 let info_ptr = info_bytes.as_mut_ptr() as *mut bpf_intf::hint_layer_info;
2605 unsafe {
2606 (*info_ptr).layer_id = v.layer_id as u32;
2607 (*info_ptr).system_cpu_util_below = match v.system_cpu_util_below {
2608 Some(threshold) => (threshold * 10000.0) as u64,
2609                        None => u64::MAX,
2610                    };
2611 (*info_ptr).dsq_insert_below = match v.dsq_insert_below {
2612 Some(threshold) => (threshold * 10000.0) as u64,
2613                        None => u64::MAX,
2614                    };
2615 }
2616
2617 skel.maps.hint_to_layer_id_map.update(
2618 &key.to_ne_bytes(),
2619 &info_bytes,
2620 libbpf_rs::MapFlags::ANY,
2621 )?;
2622 }
2623 }
2624
2625 if membw_tracking {
2626 create_perf_fds(&mut skel, event)?;
2627 }
2628
2629 let mut layers = vec![];
2630 let layer_growth_orders =
2631 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2632 for (idx, spec) in layer_specs.iter().enumerate() {
2633 let growth_order = layer_growth_orders
2634 .get(&idx)
2635 .with_context(|| "layer has no growth order".to_string())?;
2636 layers.push(Layer::new(spec, &topo, growth_order)?);
2637 }
2638
2639 let mut idle_qos_enabled = layers
2640 .iter()
2641 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2642 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2643 warn!("idle_resume_us not supported, ignoring");
2644 idle_qos_enabled = false;
2645 }
2646
2647 Self::init_cpus(&skel, &layer_specs, &topo)?;
2648 Self::init_llc_prox_map(&mut skel, &topo)?;
2649
2650 let proc_reader = fb_procfs::ProcReader::new();
2652
2653 let input = ProgramInput {
2655 ..Default::default()
2656 };
2657 let prog = &mut skel.progs.initialize_pid_namespace;
2658
2659 let _ = prog.test_run(input);
2660
2661        if !layered_task_hint_map_path.is_empty() {
2670 let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2671 let mode: libc::mode_t = 0o666;
2672 unsafe {
2673 if libc::chmod(path.as_ptr(), mode) != 0 {
2674 trace!("'chmod' to 666 of task hint map failed, continuing...");
2675 }
2676 }
2677 }
2678
2679 let struct_ops = scx_ops_attach!(skel, layered)?;
2681 let stats_server = StatsServer::new(stats::server_data()).launch()?;
2682 let mut gpu_task_handler =
2683 GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2684 gpu_task_handler.init(topo.clone());
2685
2686 let sched = Self {
2687 struct_ops: Some(struct_ops),
2688 layer_specs,
2689
2690 sched_intv: Duration::from_secs_f64(opts.interval),
2691 layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2692
2693 cpu_pool,
2694 layers,
2695 idle_qos_enabled,
2696
2697 sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2698
2699 cgroup_regexes: Some(cgroup_regexes),
2700 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2701 processing_dur: Default::default(),
2702
2703 proc_reader,
2704 skel,
2705
2706 topo,
2707 netdevs,
2708 stats_server,
2709 gpu_task_handler,
2710 };
2711
2712 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2713
2714 Ok(sched)
2715 }
2716
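    /// Copy a `Cpumask` into the packed byte array used on the BPF side:
    /// CPU `c` maps to bit `c % 8` of byte `c / 8` (e.g. CPU 10 sets bit 2 of
    /// byte 1).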
2717 fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2718 for cpu in 0..mask.len() {
2719 if mask.test_cpu(cpu) {
2720 bpfmask[cpu / 8] |= 1 << (cpu % 8);
2721 } else {
2722 bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2723 }
2724 }
2725 }
2726
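    /// Push a layer's current CPU assignment (cpumask, total and per-LLC CPU
    /// counts) into its BPF counterpart and flag it for refresh.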
2727 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2728 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2729 Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2730
2731 bpf_layer.nr_cpus = layer.nr_cpus as u32;
2732 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2733 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2734 }
2735
2736 bpf_layer.refresh_cpus = 1;
2737 }
2738
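    /// Restrict each network device's IRQ affinity to the currently available
    /// (unallocated) CPUs on the device's NUMA node, falling back to the whole
    /// node's CPUs if none of them are available.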
2739 fn update_netdev_cpumasks(&mut self) -> Result<()> {
2740 let available_cpus = self.cpu_pool.available_cpus();
2741 if available_cpus.is_empty() {
2742 return Ok(());
2743 }
2744
2745 for (iface, netdev) in self.netdevs.iter_mut() {
2746 let node = self
2747 .topo
2748 .nodes
2749 .values()
2750                    .find(|n| n.id == netdev.node())
2752 .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2753 let node_cpus = node.span.clone();
2754 for (irq, irqmask) in netdev.irqs.iter_mut() {
2755 irqmask.clear_all();
2756 for cpu in available_cpus.iter() {
2757 if !node_cpus.test_cpu(cpu) {
2758 continue;
2759 }
2760 let _ = irqmask.set_cpu(cpu);
2761 }
2762 if irqmask.weight() == 0 {
2764 for cpu in node_cpus.iter() {
2765 let _ = irqmask.set_cpu(cpu);
2766 }
2767 }
2768 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2769 }
2770 netdev.apply_cpumasks()?;
2771 }
2772
2773 Ok(())
2774 }
2775
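    /// Estimate how many CPUs keep a layer within its memory-bandwidth budget.
    /// Both values are converted from GiB to bytes, the layer's measured
    /// bandwidth is divided by its current CPU count to get a per-CPU figure,
    /// and the limit divided by that figure is the clamped target. For
    /// example, 8 GiB across 4 CPUs is 2 GiB per CPU; with a 4 GiB limit the
    /// result is 2 CPUs. A zero limit or zero measured bandwidth leaves
    /// `curtarget` unchanged.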
2776 fn clamp_target_by_membw(
2777 &self,
2778 layer: &Layer,
2779 membw_limit: f64,
2780 membw: f64,
2781 curtarget: u64,
2782 ) -> usize {
2783 let ncpu: u64 = layer.cpus.weight() as u64;
2784        let membw = (membw * 1024f64.powi(3)).round() as u64;
2785        let membw_limit = (membw_limit * 1024f64.powi(3)).round() as u64;
2786 let last_membw_percpu = if ncpu > 0 { membw / ncpu } else { 0 };
2787
2788 if membw_limit == 0 || last_membw_percpu == 0 {
2791 return curtarget as usize;
2792 }
2793
2794        (membw_limit / last_membw_percpu) as usize
2795 }
2796
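    /// Compute per-layer `(target, min)` CPU counts. For Confined and Grouped
    /// layers the target is derived from recent utilization and the layer's
    /// `util_range` (low = util / upper bound, high = util / lower bound),
    /// clamped to `cpus_range` and further clamped by the memory-bandwidth
    /// limit when one is configured. Open layers get `(0, 0)`.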
2797 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2803 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2804 let utils = &self.sched_stats.layer_utils;
2805 let membws = &self.sched_stats.layer_membws;
2806
2807 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2808 let mut targets: Vec<(usize, usize)> = vec![];
2809
2810 for (idx, layer) in self.layers.iter().enumerate() {
2811 targets.push(match &layer.kind {
2812 LayerKind::Confined {
2813 util_range,
2814 cpus_range,
2815 cpus_range_frac,
2816 membw_gb,
2817 ..
2818 }
2819 | LayerKind::Grouped {
2820 util_range,
2821 cpus_range,
2822 cpus_range_frac,
2823 membw_gb,
2824 ..
2825 } => {
2826 let cpus_range =
2827 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2828
2829 let owned = utils[idx][LAYER_USAGE_OWNED];
2834 let open = utils[idx][LAYER_USAGE_OPEN];
2835
2836 let membw_owned = membws[idx][LAYER_USAGE_OWNED];
2837 let membw_open = membws[idx][LAYER_USAGE_OPEN];
2838
2839 let mut util = owned;
2840 let mut membw = membw_owned;
2841 if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2842 util += open;
2843 membw += membw_open;
2844 }
2845
2846 let util = if util < 0.01 { 0.0 } else { util };
2847 let low = (util / util_range.1).ceil() as usize;
2848 let high = ((util / util_range.0).floor() as usize).max(low);
2849
2850 let membw_limit = match membw_gb {
2851 Some(membw_limit) => *membw_limit,
2852 None => 0.0,
2853 };
2854
2855 trace!(
2856                            "layer {0} (membw, membw_limit): ({membw} GiB, {membw_limit} GiB)",
2857 layer.name
2858 );
2859
2860 let target = layer.cpus.weight().clamp(low, high);
2861
2862 records.push((
2863 (owned * 100.0) as u64,
2864 (open * 100.0) as u64,
2865 (util * 100.0) as u64,
2866 low,
2867 high,
2868 target,
2869 ));
2870
2871 let target = target.clamp(cpus_range.0, cpus_range.1);
2872 let membw_target = self.clamp_target_by_membw(
2873 &layer,
2874 membw_limit as f64,
2875 membw as f64,
2876 target as u64,
2877 );
2878
2879 trace!("CPU target pre- and post-membw adjustment: {target} -> {membw_target}");
2880
2881 if membw_target < cpus_range.0 {
2884 warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2885                            warn!("membw_target {membw_target} is below the cpus_range minimum {}", cpus_range.0);
2886 };
2887
2888 let target = membw_target.clamp(cpus_range.0, target);
2891
2892 (target, cpus_range.0)
2893 }
2894 LayerKind::Open { .. } => (0, 0),
2895 });
2896 }
2897
2898 trace!("(owned, open, util, low, high, target): {:?}", &records);
2899 targets
2900 }
2901
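    /// Distribute the machine's CPUs across layers according to their weights.
    /// Layers whose target is at or below their minimum are satisfied first,
    /// then layers whose target fits within their weighted share of what is
    /// left, and the remaining CPUs are finally split among the rest in
    /// proportion to weight.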
2902 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2906 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2907 let weights: Vec<usize> = self
2908 .layers
2909 .iter()
2910 .map(|layer| layer.kind.common().weight as usize)
2911 .collect();
2912 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2913 .iter()
2914 .zip(&weights)
2915 .enumerate()
2916 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2917 .collect();
2918 let mut weight_sum: usize = weights.iter().sum();
2919 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2920
2921 trace!("cands: {:?}", &cands);
2922
2923 cands.retain(|&i, &mut (target, min, weight)| {
2925 if target <= min {
2926 let target = target.min(nr_left);
2927 weighted[i] = target;
2928 weight_sum -= weight;
2929 nr_left -= target;
2930 false
2931 } else {
2932 true
2933 }
2934 });
2935
2936 trace!("cands after accepting mins: {:?}", &cands);
2937
2938 let calc_share = |nr_left, weight, weight_sum| {
2940 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2941 };
2942
2943 while !cands.is_empty() {
2944 let mut progress = false;
2945
2946 cands.retain(|&i, &mut (target, _min, weight)| {
2947 let share = calc_share(nr_left, weight, weight_sum);
2948 if target <= share {
2949 weighted[i] = target;
2950 weight_sum -= weight;
2951 nr_left -= target;
2952 progress = true;
2953 false
2954 } else {
2955 true
2956 }
2957 });
2958
2959 if !progress {
2960 break;
2961 }
2962 }
2963
2964 trace!("cands after accepting under allotted: {:?}", &cands);
2965
2966 let nr_to_share = nr_left;
2969 for (i, (_target, _min, weight)) in cands.into_iter() {
2970 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2971 weighted[i] = share;
2972 nr_left -= share;
2973 }
2974
2975 trace!("weighted: {:?}", &weighted);
2976
2977 weighted
2978 }
2979
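    /// Translate a CPU target into `(full LLCs, extra cores)`. For example,
    /// with 8 CPUs per LLC and 2 CPUs per core, a target of 21 CPUs becomes
    /// 2 full LLCs plus ceil(5 / 2) = 3 extra cores.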
2980 fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2984 let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2986 let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2988 let cpus_per_llc = cores_per_llc * cpus_per_core;
2989
2990 let full = target / cpus_per_llc;
2991 let extra = target % cpus_per_llc;
2992
2993 (full, extra.div_ceil(cpus_per_core))
2994 }
2995
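    /// Rebalance whole LLCs among StickyDynamic layers: first free LLCs from
    /// layers whose target shrank, then hand free LLCs to layers whose target
    /// grew, rebuild each layer's core order (spillover cores from free LLCs
    /// plus the cores of its assigned LLCs), and finally apply the resulting
    /// CPU changes to the layers and the CPU pool. Returns whether any
    /// layer's CPU assignment changed.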
2996    fn recompute_layer_core_order(&mut self, layer_targets: &[(usize, usize)]) -> Result<bool> {
3004 debug!(
3006 " free: before pass: free_llcs={:?}",
3007 self.cpu_pool.free_llcs
3008 );
3009 for &(idx, target) in layer_targets.iter().rev() {
3010 let layer = &mut self.layers[idx];
3011 let old_tlc = layer.target_llc_cpus;
3012 let new_tlc = Self::compute_target_llcs(target, &self.topo);
3013
3014 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3015 continue;
3016 }
3017
3018 let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
3019
3020 debug!(
3021 " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
3022 layer.name,
3023 old_tlc,
3024 new_tlc,
3025 to_free,
3026 layer.assigned_llcs.len(),
3027 self.cpu_pool.free_llcs.len()
3028 );
3029
3030            while to_free > 0 && !layer.assigned_llcs.is_empty() {
3031 let llc = layer.assigned_llcs.pop().unwrap();
3032 self.cpu_pool.free_llcs.push((llc, 0));
3033 to_free -= 1;
3034
3035 debug!(" layer={} freed_llc={}", layer.name, llc);
3036 }
3037 }
3038 debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
3039
3040 for &(idx, target) in layer_targets.iter().rev() {
3042 let layer = &mut self.layers[idx];
3043 let old_tlc = layer.target_llc_cpus;
3044 let new_tlc = Self::compute_target_llcs(target, &self.topo);
3045
3046 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3047 continue;
3048 }
3049
3050 let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
3051
3052 debug!(
3053 " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
3054 layer.name,
3055 old_tlc,
3056 new_tlc,
3057 to_alloc,
3058 layer.assigned_llcs.len(),
3059 self.cpu_pool.free_llcs.len()
3060 );
3061
3062 while to_alloc > 0
3063                && !self.cpu_pool.free_llcs.is_empty()
3064 && to_alloc <= self.cpu_pool.free_llcs.len()
3065 {
3066 let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
3067 layer.assigned_llcs.push(llc);
3068 to_alloc -= 1;
3069
3070 debug!(" layer={} alloc_llc={}", layer.name, llc);
3071 }
3072
3073 debug!(
3074 " alloc: layer={} assigned_llcs={:?}",
3075 layer.name, layer.assigned_llcs
3076 );
3077
3078 layer.target_llc_cpus = new_tlc;
3080 }
3081
3082 for &(idx, _) in layer_targets.iter() {
3085 let mut core_order = vec![];
3086 let layer = &mut self.layers[idx];
3087
3088 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3089 continue;
3090 }
3091
3092 let tlc = layer.target_llc_cpus;
3093 let mut extra = tlc.1;
3094 let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
3096 let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
3097 let cpus_per_llc = cores_per_llc * cpus_per_core;
3098
3099 for i in 0..self.cpu_pool.free_llcs.len() {
3101 let free_vec = &mut self.cpu_pool.free_llcs;
3102 let avail = cpus_per_llc - free_vec[i].1;
3104 let mut used = extra.min(avail);
3106 let cores_to_add = used;
3107
3108 let shift = free_vec[i].1;
3109 free_vec[i].1 += used;
3110
3111 let llc_id = free_vec[i].0;
3112 let llc = self.topo.all_llcs.get(&llc_id).unwrap();
3113
3114 for core in llc.cores.iter().skip(shift) {
3115 if used == 0 {
3116 break;
3117 }
3118 core_order.push(core.1.id);
3119 used -= 1;
3120 }
3121
3122 extra -= cores_to_add;
3123 if extra == 0 {
3124 break;
3125 }
3126 }
3127
3128 core_order.reverse();
3129 layer.core_order = core_order;
3130 }
3131
3132 for i in 0..self.cpu_pool.free_llcs.len() {
3134 self.cpu_pool.free_llcs[i].1 = 0;
3135 }
3136
3137 for &(idx, _) in layer_targets.iter() {
3138 let layer = &mut self.layers[idx];
3139
3140 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3141 continue;
3142 }
3143
3144 for core in self.topo.all_cores.iter() {
3145 let llc_id = core.1.llc_id;
3146 if layer.assigned_llcs.contains(&llc_id) {
3147 layer.core_order.push(core.1.id);
3148 }
3149 }
3150 layer.core_order.reverse();
3152
3153 debug!(
3154 " alloc: layer={} core_order={:?}",
3155 layer.name, layer.core_order
3156 );
3157 }
3158
3159 let mut updated = false;
3163
3164 for &(idx, _) in layer_targets.iter() {
3166 let layer = &mut self.layers[idx];
3167
3168 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3169 continue;
3170 }
3171
3172 let mut new_cpus = Cpumask::new();
3174 for &core_id in &layer.core_order {
3175 if let Some(core) = self.topo.all_cores.get(&core_id) {
3176 new_cpus |= &core.span;
3177 }
3178 }
3179
3180 new_cpus &= &layer.allowed_cpus;
3182
3183 let cpus_to_free = layer.cpus.clone().and(&new_cpus.clone().not());
3185
3186 if cpus_to_free.weight() > 0 {
3187 debug!(
3188 " apply: layer={} freeing CPUs: {}",
3189 layer.name, cpus_to_free
3190 );
3191 layer.cpus &= &cpus_to_free.not();
3193 layer.nr_cpus -= cpus_to_free.weight();
3194 for cpu in cpus_to_free.iter() {
3195 layer.nr_llc_cpus[self.cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
3196 }
3197 self.cpu_pool.free(&cpus_to_free)?;
3198 updated = true;
3199 }
3200 }
3201
3202 for &(idx, _) in layer_targets.iter() {
3204 let layer = &mut self.layers[idx];
3205
3206 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3207 continue;
3208 }
3209
3210 let mut new_cpus = Cpumask::new();
3212 for &core_id in &layer.core_order {
3213 if let Some(core) = self.topo.all_cores.get(&core_id) {
3214 new_cpus |= &core.span;
3215 }
3216 }
3217 new_cpus &= &layer.allowed_cpus;
3218
3219 let available_cpus = self.cpu_pool.available_cpus();
3221 let desired_to_alloc = new_cpus.clone().and(&layer.cpus.clone().not());
3222 let cpus_to_alloc = desired_to_alloc.clone().and(&available_cpus);
3223
3224 if desired_to_alloc.weight() > cpus_to_alloc.weight() {
3225 debug!(
3226 " apply: layer={} wanted to alloc {} CPUs but only {} available",
3227 layer.name,
3228 desired_to_alloc.weight(),
3229 cpus_to_alloc.weight()
3230 );
3231 }
3232
3233 if cpus_to_alloc.weight() > 0 {
3234 debug!(
3235 " apply: layer={} allocating CPUs: {}",
3236 layer.name, cpus_to_alloc
3237 );
3238 layer.cpus |= &cpus_to_alloc;
3240 layer.nr_cpus += cpus_to_alloc.weight();
3241 for cpu in cpus_to_alloc.iter() {
3242 layer.nr_llc_cpus[self.cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
3243 }
3244 self.cpu_pool.mark_allocated(&cpus_to_alloc)?;
3245 updated = true;
3246 }
3247
3248 debug!(
3249 " apply: layer={} final cpus.weight()={} nr_cpus={}",
3250 layer.name,
3251 layer.cpus.weight(),
3252 layer.nr_cpus
3253 );
3254 }
3255
3256 Ok(updated)
3257 }
3258
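    /// Periodically rebalance CPUs between layers: compute weighted targets,
    /// shrink over-allocated layers before growing under-allocated ones, and,
    /// when anything changed, hand the remaining free CPUs to open layers and
    /// push the refreshed cpumasks and empty-layer list down to BPF.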
3259 fn refresh_cpumasks(&mut self) -> Result<()> {
3260 let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
3261
3262 let mut updated = false;
3263 let targets = self.calc_target_nr_cpus();
3264 let targets = self.weighted_target_nr_cpus(&targets);
3265
3266 let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
3267 ascending.sort_by(|a, b| a.1.cmp(&b.1));
3268
3269 let sticky_dynamic_updated = self.recompute_layer_core_order(&ascending)?;
3270 updated |= sticky_dynamic_updated;
3271
3272 if sticky_dynamic_updated {
3274 for (idx, layer) in self.layers.iter().enumerate() {
3275 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3276 Self::update_bpf_layer_cpumask(
3277 layer,
3278 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3279 );
3280 }
3281 }
3282 }
3283
3284 let mut force_free = self
3287 .layers
3288 .iter()
3289 .zip(targets.iter())
3290 .any(|(layer, &target)| layer.nr_cpus < target);
3291
3292 for &(idx, target) in ascending.iter().rev() {
3296 let layer = &mut self.layers[idx];
3297 if layer_is_open(layer) {
3298 continue;
3299 }
3300
3301 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3303 continue;
3304 }
3305
3306 let nr_cur = layer.cpus.weight();
3307 if nr_cur <= target {
3308 continue;
3309 }
3310 let mut nr_to_free = nr_cur - target;
3311
3312 let nr_to_break_at = nr_to_free / 2;
3317
3318 let mut freed = false;
3319
3320 while nr_to_free > 0 {
3321 let max_to_free = if force_free {
3322 force_free = false;
3323 layer.nr_cpus
3324 } else {
3325 nr_to_free
3326 };
3327
3328 let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
3329 if nr_freed == 0 {
3330 break;
3331 }
3332
3333 nr_to_free = nr_to_free.saturating_sub(nr_freed);
3334 freed = true;
3335
3336 if nr_to_free <= nr_to_break_at {
3337 break;
3338 }
3339 }
3340
3341 if freed {
3342 Self::update_bpf_layer_cpumask(
3343 layer,
3344 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3345 );
3346 updated = true;
3347 }
3348 }
3349
3350 for &(idx, target) in &ascending {
3356 let layer = &mut self.layers[idx];
3357
3358 if layer_is_open(layer) {
3359 continue;
3360 }
3361
3362 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3364 continue;
3365 }
3366
3367 let nr_cur = layer.cpus.weight();
3368 if nr_cur >= target {
3369 continue;
3370 }
3371
3372 let mut nr_to_alloc = target - nr_cur;
3373 let mut alloced = false;
3374
3375 while nr_to_alloc > 0 {
3376 let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3377 if nr_alloced == 0 {
3378 break;
3379 }
3380 alloced = true;
3381 nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3382 }
3383
3384 if alloced {
3385 Self::update_bpf_layer_cpumask(
3386 layer,
3387 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3388 );
3389 updated = true;
3390 }
3391 }
3392
3393 if updated {
3395 for (idx, layer) in self.layers.iter_mut().enumerate() {
3396 if !layer_is_open(layer) {
3397 continue;
3398 }
3399
3400 let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3401 let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3402 let nr_available_cpus = available_cpus.weight();
3403
3404 layer.cpus = available_cpus;
3407 layer.nr_cpus = nr_available_cpus;
3408 Self::update_bpf_layer_cpumask(layer, bpf_layer);
3409 }
3410
3411 self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3412 self.cpu_pool.fallback_cpu as u32;
3413
3414 for (lidx, layer) in self.layers.iter().enumerate() {
3415 self.nr_layer_cpus_ranges[lidx] = (
3416 self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3417 self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3418 );
3419 }
3420
3421 let input = ProgramInput {
3423 ..Default::default()
3424 };
3425 let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3426 let _ = prog.test_run(input);
3427
3428 let empty_layer_ids: Vec<u32> = self
3430 .layers
3431 .iter()
3432 .enumerate()
3433 .filter(|(_idx, layer)| layer.nr_cpus == 0)
3434 .map(|(idx, _layer)| idx as u32)
3435 .collect();
3436 for i in 0..self.layers.len() {
3437 self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3438 empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3439 }
3440 self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3441 empty_layer_ids.len() as u32;
3442 }
3443
3444 let _ = self.update_netdev_cpumasks();
3445 Ok(())
3446 }
3447
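    /// Propagate each layer's `idle_resume_us` setting to the CPUs it owns by
    /// updating the per-CPU idle resume latency limits.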
3448 fn refresh_idle_qos(&mut self) -> Result<()> {
3449 if !self.idle_qos_enabled {
3450 return Ok(());
3451 }
3452
3453 let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3454 for layer in self.layers.iter() {
3455 let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3456 for cpu in layer.cpus.iter() {
3457 cpu_idle_qos[cpu] = idle_resume_us;
3458 }
3459 }
3460
3461 for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3462 update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3463 }
3464
3465 Ok(())
3466 }
3467
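    /// One scheduling interval: refresh stats, publish the CPU-utilization and
    /// DSQ-insert EWMAs to BPF, rebalance cpumasks and idle QoS, and let the
    /// GPU task affinitizer run if it is due.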
3468 fn step(&mut self) -> Result<()> {
3469 let started_at = Instant::now();
3470 self.sched_stats.refresh(
3471 &mut self.skel,
3472 &self.proc_reader,
3473 started_at,
3474 self.processing_dur,
3475 &self.gpu_task_handler,
3476 )?;
3477
3478 self.skel
3480 .maps
3481 .bss_data
3482 .as_mut()
3483 .unwrap()
3484 .system_cpu_util_ewma = (self.sched_stats.system_cpu_util_ewma * 10000.0) as u64;
3485
3486 for layer_id in 0..self.sched_stats.nr_layers {
3487 self.skel
3488 .maps
3489 .bss_data
3490 .as_mut()
3491 .unwrap()
3492 .layer_dsq_insert_ewma[layer_id] =
3493 (self.sched_stats.layer_dsq_insert_ewma[layer_id] * 10000.0) as u64;
3494 }
3495
3496 self.refresh_cpumasks()?;
3497 self.refresh_idle_qos()?;
3498 self.gpu_task_handler.maybe_affinitize();
3499 self.processing_dur += Instant::now().duration_since(started_at);
3500 Ok(())
3501 }
3502
3503 fn generate_sys_stats(
3504 &mut self,
3505 stats: &Stats,
3506 cpus_ranges: &mut [(usize, usize)],
3507 ) -> Result<SysStats> {
3508 let bstats = &stats.bpf_stats;
3509 let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3510
3511 for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3512 let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3513 sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3514 cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3515 }
3516
3517 Ok(sys_stats)
3518 }
3519
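    /// Resolve a newly seen cgroup directory to its inode-based cgroup ID,
    /// match its path against the configured regex rules to build a rule
    /// bitmap, and forward a `Created` event to the scheduler thread.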
3520 fn process_cgroup_creation(
3522 path: &Path,
3523 cgroup_regexes: &HashMap<u32, Regex>,
3524 cgroup_path_to_id: &mut HashMap<String, u64>,
3525 sender: &crossbeam::channel::Sender<CgroupEvent>,
3526 ) {
3527 let path_str = path.to_string_lossy().to_string();
3528
3529 let cgroup_id = std::fs::metadata(path)
3531 .map(|metadata| {
3532 use std::os::unix::fs::MetadataExt;
3533 metadata.ino()
3534 })
3535 .unwrap_or(0);
3536
3537 let mut match_bitmap = 0u64;
3539 for (rule_id, regex) in cgroup_regexes {
3540 if regex.is_match(&path_str) {
3541 match_bitmap |= 1u64 << rule_id;
3542 }
3543 }
3544
3545 cgroup_path_to_id.insert(path_str.clone(), cgroup_id);
3547
3548 if let Err(e) = sender.send(CgroupEvent::Created {
3550 path: path_str,
3551 cgroup_id,
3552 match_bitmap,
3553 }) {
3554 error!("Failed to send cgroup creation event: {}", e);
3555 }
3556 }
3557
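    /// Spawn a background thread that watches /sys/fs/cgroup with inotify: it
    /// first emits events for all existing cgroup directories, then polls the
    /// inotify fd with a 100ms select() timeout so it can notice shutdown,
    /// translating CREATE/DELETE events into `CgroupEvent`s on the returned
    /// channel.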
3558 fn start_cgroup_watcher(
3559 shutdown: Arc<AtomicBool>,
3560 cgroup_regexes: HashMap<u32, Regex>,
3561 ) -> Result<Receiver<CgroupEvent>> {
3562 let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
3563 let mut wd_to_path = HashMap::new();
3564
3565 let (sender, receiver) = crossbeam::channel::bounded::<CgroupEvent>(1024);
3567
3568 let root_wd = inotify
3570 .watches()
3571 .add("/sys/fs/cgroup", WatchMask::CREATE | WatchMask::DELETE)
3572 .context("Failed to add watch for /sys/fs/cgroup")?;
3573 wd_to_path.insert(root_wd, PathBuf::from("/sys/fs/cgroup"));
3574
3575 Self::add_recursive_watches(&mut inotify, &mut wd_to_path, Path::new("/sys/fs/cgroup"))?;
3577
3578 std::thread::spawn(move || {
3580 let mut buffer = [0; 4096];
3581 let inotify_fd = inotify.as_raw_fd();
3582 let mut cgroup_path_to_id = HashMap::<String, u64>::new();
3584
3585 for entry in WalkDir::new("/sys/fs/cgroup")
3587 .into_iter()
3588 .filter_map(|e| e.ok())
3589 .filter(|e| e.file_type().is_dir())
3590 {
3591 let path = entry.path();
3592 Self::process_cgroup_creation(
3593 path,
3594 &cgroup_regexes,
3595 &mut cgroup_path_to_id,
3596 &sender,
3597 );
3598 }
3599
3600 while !shutdown.load(Ordering::Relaxed) {
3601 let ready = unsafe {
3603 let mut read_fds: libc::fd_set = std::mem::zeroed();
3604 libc::FD_ZERO(&mut read_fds);
3605 libc::FD_SET(inotify_fd, &mut read_fds);
3606
3607 let mut timeout = libc::timeval {
3608 tv_sec: 0,
3609                        tv_usec: 100_000,
3610                    };
3611
3612 libc::select(
3613 inotify_fd + 1,
3614 &mut read_fds,
3615 std::ptr::null_mut(),
3616 std::ptr::null_mut(),
3617 &mut timeout,
3618 )
3619 };
3620
3621 if ready <= 0 {
3622 continue;
3624 }
3625
3626 let events = match inotify.read_events(&mut buffer) {
3628 Ok(events) => events,
3629 Err(e) => {
3630 error!("Error reading inotify events: {}", e);
3631 break;
3632 }
3633 };
3634
3635 for event in events {
3636 if !event.mask.contains(inotify::EventMask::CREATE)
3637 && !event.mask.contains(inotify::EventMask::DELETE)
3638 {
3639 continue;
3640 }
3641
3642 let name = match event.name {
3643 Some(name) => name,
3644 None => continue,
3645 };
3646
3647 let parent_path = match wd_to_path.get(&event.wd) {
3648 Some(parent) => parent,
3649 None => {
3650 warn!("Unknown watch descriptor: {:?}", event.wd);
3651 continue;
3652 }
3653 };
3654
3655 let path = parent_path.join(name.to_string_lossy().as_ref());
3656
3657 if event.mask.contains(inotify::EventMask::CREATE) {
3658 if !path.is_dir() {
3659 continue;
3660 }
3661
3662 Self::process_cgroup_creation(
3663 &path,
3664 &cgroup_regexes,
3665 &mut cgroup_path_to_id,
3666 &sender,
3667 );
3668
3669 match inotify
3671 .watches()
3672 .add(&path, WatchMask::CREATE | WatchMask::DELETE)
3673 {
3674 Ok(wd) => {
3675 wd_to_path.insert(wd, path.clone());
3676 }
3677 Err(e) => {
3678 warn!(
3679 "Failed to add watch for new cgroup {}: {}",
3680 path.display(),
3681 e
3682 );
3683 }
3684 }
3685 } else if event.mask.contains(inotify::EventMask::DELETE) {
3686 let path_str = path.to_string_lossy().to_string();
3687
3688 let cgroup_id = cgroup_path_to_id.remove(&path_str).unwrap_or(0);
3690
3691 if let Err(e) = sender.send(CgroupEvent::Removed {
3693 path: path_str,
3694 cgroup_id,
3695 }) {
3696 error!("Failed to send cgroup removal event: {}", e);
3697 }
3698
3699 let wd_to_remove = wd_to_path.iter().find_map(|(wd, watched_path)| {
3701 if watched_path == &path {
3702 Some(wd.clone())
3703 } else {
3704 None
3705 }
3706 });
3707 if let Some(wd) = wd_to_remove {
3708 wd_to_path.remove(&wd);
3709 }
3710 }
3711 }
3712 }
3713 });
3714
3715 Ok(receiver)
3716 }
3717
3718 fn add_recursive_watches(
3719 inotify: &mut Inotify,
3720 wd_to_path: &mut HashMap<inotify::WatchDescriptor, PathBuf>,
3721 path: &Path,
3722 ) -> Result<()> {
3723 for entry in WalkDir::new(path)
3724 .into_iter()
3725 .filter_map(|e| e.ok())
3726 .filter(|e| e.file_type().is_dir())
3727 .skip(1)
3728 {
3729 let entry_path = entry.path();
3730 match inotify
3732 .watches()
3733 .add(entry_path, WatchMask::CREATE | WatchMask::DELETE)
3734 {
3735 Ok(wd) => {
3736 wd_to_path.insert(wd, entry_path.to_path_buf());
3737 }
3738 Err(e) => {
3739 debug!("Failed to add watch for {}: {}", entry_path.display(), e);
3740 }
3741 }
3742 }
3743 Ok(())
3744 }
3745
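    /// Main loop: run a scheduling step whenever the interval expires, bump
    /// the layer refresh sequence number when due, and multiplex stats
    /// requests, cgroup events, and the scheduling timer over a crossbeam
    /// `select!`.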
3746 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3747 let (res_ch, req_ch) = self.stats_server.channels();
3748 let mut next_sched_at = Instant::now() + self.sched_intv;
3749 let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3750 let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3751 let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3752
3753 let cgroup_regexes = self.cgroup_regexes.take().unwrap();
3755 let cgroup_event_rx = if !cgroup_regexes.is_empty() {
3756 Some(Self::start_cgroup_watcher(
3757 shutdown.clone(),
3758 cgroup_regexes,
3759 )?)
3760 } else {
3761 None
3762 };
3763
3764 while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3765 let now = Instant::now();
3766
3767 if now >= next_sched_at {
3768 self.step()?;
3769 while next_sched_at < now {
3770 next_sched_at += self.sched_intv;
3771 }
3772 }
3773
3774 if enable_layer_refresh && now >= next_layer_refresh_at {
3775 self.skel
3776 .maps
3777 .bss_data
3778 .as_mut()
3779 .unwrap()
3780 .layer_refresh_seq_avgruntime += 1;
3781 while next_layer_refresh_at < now {
3782 next_layer_refresh_at += self.layer_refresh_intv;
3783 }
3784 }
3785
3786 let timeout_duration = next_sched_at.saturating_duration_since(Instant::now());
3788 let never_rx = crossbeam::channel::never();
3789 let cgroup_rx = cgroup_event_rx.as_ref().unwrap_or(&never_rx);
3790
3791 select! {
3792 recv(req_ch) -> msg => match msg {
3793 Ok(StatsReq::Hello(tid)) => {
3794 cpus_ranges.insert(
3795 tid,
3796 self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3797 );
3798 let stats =
3799 Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3800 res_ch.send(StatsRes::Hello(stats))?;
3801 }
3802 Ok(StatsReq::Refresh(tid, mut stats)) => {
3803 for i in 0..self.nr_layer_cpus_ranges.len() {
3805 for (_, ranges) in cpus_ranges.iter_mut() {
3806 ranges[i] = (
3807 ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3808 ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3809 );
3810 }
3811 self.nr_layer_cpus_ranges[i] =
3812 (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3813 }
3814
3815 stats.refresh(
3816 &mut self.skel,
3817 &self.proc_reader,
3818 now,
3819 self.processing_dur,
3820 &self.gpu_task_handler,
3821 )?;
3822 let sys_stats =
3823 self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3824 res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3825 }
3826 Ok(StatsReq::Bye(tid)) => {
3827 cpus_ranges.remove(&tid);
3828 res_ch.send(StatsRes::Bye)?;
3829 }
3830 Err(e) => Err(e)?,
3831 },
3832
3833 recv(cgroup_rx) -> event => match event {
3834 Ok(CgroupEvent::Created { path, cgroup_id, match_bitmap }) => {
3835 self.skel.maps.cgroup_match_bitmap.update(
3837 &cgroup_id.to_ne_bytes(),
3838 &match_bitmap.to_ne_bytes(),
3839 libbpf_rs::MapFlags::ANY,
3840 ).with_context(|| format!(
3841 "Failed to insert cgroup {}({}) into BPF map. Cgroup map may be full \
3842 (max 16384 entries). Aborting.",
3843 cgroup_id, path
3844 ))?;
3845
3846 debug!("Added cgroup {} to BPF map with bitmap 0x{:x}", cgroup_id, match_bitmap);
3847 }
3848 Ok(CgroupEvent::Removed { path, cgroup_id }) => {
3849 if let Err(e) = self.skel.maps.cgroup_match_bitmap.delete(&cgroup_id.to_ne_bytes()) {
3851 warn!("Failed to delete cgroup {} from BPF map: {}", cgroup_id, e);
3852 } else {
3853 debug!("Removed cgroup {}({}) from BPF map", cgroup_id, path);
3854 }
3855 }
3856 Err(e) => {
3857 error!("Error receiving cgroup event: {}", e);
3858 }
3859 },
3860
3861 recv(crossbeam::channel::after(timeout_duration)) -> _ => {
3862 }
3864 }
3865 }
3866
3867 let _ = self.struct_ops.take();
3868 uei_report!(&self.skel, uei)
3869 }
3870}
3871
3872impl Drop for Scheduler<'_> {
3873 fn drop(&mut self) {
3874 info!("Unregister {SCHEDULER_NAME} scheduler");
3875
3876 if let Some(struct_ops) = self.struct_ops.take() {
3877 drop(struct_ops);
3878 }
3879 }
3880}
3881
3882fn write_example_file(path: &str) -> Result<()> {
3883 let mut f = fs::OpenOptions::new()
3884 .create_new(true)
3885 .write(true)
3886 .open(path)?;
3887 Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3888}
3889
3890struct HintLayerInfo {
3891 layer_id: usize,
3892 system_cpu_util_below: Option<f64>,
3893 dsq_insert_below: Option<f64>,
3894}
3895
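/// Validate layer specs (match counts, thresholds, cpus/util ranges) and build
/// the map from hint value to the layer that owns it, carrying along any
/// SystemCpuUtilBelow / DsqInsertBelow thresholds attached to the same AND
/// block.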
3896fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, HintLayerInfo>> {
3897 let mut hint_to_layer_map = HashMap::<u64, (usize, String, Option<f64>, Option<f64>)>::new();
3898
3899 let nr_specs = specs.len();
3900 if nr_specs == 0 {
3901 bail!("No layer spec");
3902 }
3903 if nr_specs > MAX_LAYERS {
3904 bail!("Too many layer specs");
3905 }
3906
3907 for (idx, spec) in specs.iter().enumerate() {
3908 if idx < nr_specs - 1 {
3909 if spec.matches.is_empty() {
3910 bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3911 }
3912 } else {
3913 if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3914 bail!("Terminal spec {:?} must have an empty match", spec.name);
3915 }
3916 }
3917
3918 if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3919 bail!(
3920 "Spec {:?} has too many ({}) OR match blocks",
3921 spec.name,
3922 spec.matches.len()
3923 );
3924 }
3925
3926 for (ands_idx, ands) in spec.matches.iter().enumerate() {
3927 if ands.len() > NR_LAYER_MATCH_KINDS {
3928 bail!(
3929 "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3930 spec.name,
3931 ands_idx,
3932 ands.len()
3933 );
3934 }
3935 let mut hint_equals_cnt = 0;
3936 let mut system_cpu_util_below_cnt = 0;
3937 let mut dsq_insert_below_cnt = 0;
3938 let mut hint_value: Option<u64> = None;
3939 let mut system_cpu_util_threshold: Option<f64> = None;
3940 let mut dsq_insert_threshold: Option<f64> = None;
3941 for one in ands.iter() {
3942 match one {
3943 LayerMatch::CgroupPrefix(prefix) => {
3944 if prefix.len() > MAX_PATH {
3945 bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3946 }
3947 }
3948 LayerMatch::CgroupSuffix(suffix) => {
3949 if suffix.len() > MAX_PATH {
3950 bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3951 }
3952 }
3953 LayerMatch::CgroupContains(substr) => {
3954 if substr.len() > MAX_PATH {
3955 bail!("Spec {:?} has too long a cgroup substr", spec.name);
3956 }
3957 }
3958 LayerMatch::CommPrefix(prefix) => {
3959 if prefix.len() > MAX_COMM {
3960 bail!("Spec {:?} has too long a comm prefix", spec.name);
3961 }
3962 }
3963 LayerMatch::PcommPrefix(prefix) => {
3964 if prefix.len() > MAX_COMM {
3965 bail!("Spec {:?} has too long a process name prefix", spec.name);
3966 }
3967 }
3968 LayerMatch::SystemCpuUtilBelow(threshold) => {
3969 if *threshold < 0.0 || *threshold > 1.0 {
3970 bail!(
3971 "Spec {:?} has SystemCpuUtilBelow threshold outside the range [0.0, 1.0]",
3972 spec.name
3973 );
3974 }
3975 system_cpu_util_threshold = Some(*threshold);
3976 system_cpu_util_below_cnt += 1;
3977 }
3978 LayerMatch::DsqInsertBelow(threshold) => {
3979 if *threshold < 0.0 || *threshold > 1.0 {
3980 bail!(
3981 "Spec {:?} has DsqInsertBelow threshold outside the range [0.0, 1.0]",
3982 spec.name
3983 );
3984 }
3985 dsq_insert_threshold = Some(*threshold);
3986 dsq_insert_below_cnt += 1;
3987 }
3988 LayerMatch::HintEquals(hint) => {
3989 if *hint > 1024 {
3990 bail!(
3991 "Spec {:?} has hint value outside the range [0, 1024]",
3992 spec.name
3993 );
3994 }
3995 hint_value = Some(*hint);
3996 hint_equals_cnt += 1;
3997 }
3998 _ => {}
3999 }
4000 }
4001 if hint_equals_cnt > 1 {
4002 bail!("Only 1 HintEquals match permitted per AND block");
4003 }
4004 let high_freq_matcher_cnt = system_cpu_util_below_cnt + dsq_insert_below_cnt;
4005 if high_freq_matcher_cnt > 0 {
4006 if hint_equals_cnt != 1 {
4007 bail!("High-frequency matchers (SystemCpuUtilBelow, DsqInsertBelow) must be used with one HintEquals");
4008 }
4009 if system_cpu_util_below_cnt > 1 {
4010 bail!("Only 1 SystemCpuUtilBelow match permitted per AND block");
4011 }
4012 if dsq_insert_below_cnt > 1 {
4013 bail!("Only 1 DsqInsertBelow match permitted per AND block");
4014 }
4015 if ands.len() != hint_equals_cnt + system_cpu_util_below_cnt + dsq_insert_below_cnt
4016 {
4017 bail!("High-frequency matchers must be used only with HintEquals (no other matchers)");
4018 }
4019 } else if hint_equals_cnt == 1 && ands.len() != 1 {
4020 bail!("HintEquals match cannot be in conjunction with other matches");
4021 }
4022
4023 if let Some(hint) = hint_value {
4025 if let Some((layer_id, name, _, _)) = hint_to_layer_map.get(&hint) {
4026 if *layer_id != idx {
4027 bail!(
4028 "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
4029 spec.name,
4030 hint,
4031 name
4032 );
4033 }
4034 } else {
4035 hint_to_layer_map.insert(
4036 hint,
4037 (
4038 idx,
4039 spec.name.clone(),
4040 system_cpu_util_threshold,
4041 dsq_insert_threshold,
4042 ),
4043 );
4044 }
4045 }
4046 }
4047
4048 match spec.kind {
4049 LayerKind::Confined {
4050 cpus_range,
4051 util_range,
4052 ..
4053 }
4054 | LayerKind::Grouped {
4055 cpus_range,
4056 util_range,
4057 ..
4058 } => {
4059 if let Some((cpus_min, cpus_max)) = cpus_range {
4060 if cpus_min > cpus_max {
4061 bail!(
4062 "Spec {:?} has invalid cpus_range({}, {})",
4063 spec.name,
4064 cpus_min,
4065 cpus_max
4066 );
4067 }
4068 }
4069 if util_range.0 >= util_range.1 {
4070 bail!(
4071 "Spec {:?} has invalid util_range ({}, {})",
4072 spec.name,
4073 util_range.0,
4074 util_range.1
4075 );
4076 }
4077 }
4078 _ => {}
4079 }
4080 }
4081
4082 Ok(hint_to_layer_map
4083 .into_iter()
4084 .map(|(k, v)| {
4085 (
4086 k,
4087 HintLayerInfo {
4088 layer_id: v.0,
4089 system_cpu_util_below: v.2,
4090 dsq_insert_below: v.3,
4091 },
4092 )
4093 })
4094 .collect())
4095}
4096
4097fn name_suffix(cgroup: &str, len: usize) -> String {
4098 let suffixlen = std::cmp::min(len, cgroup.len());
4099 let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
4100
4101 suffixrev.chars().rev().collect()
4102}
4103
4104fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
4105 let mut paths = vec![];
4106
4107 if !dir.is_dir() {
4108        bail!("path {:?} is not a directory", dir);
4109 }
4110
4111 let direntries = fs::read_dir(dir)?;
4112
4113 for entry in direntries {
4114 let path = entry?.path();
4115 if path.is_dir() {
4116 paths.append(&mut traverse_sysfs(&path)?);
4117 paths.push(path);
4118 }
4119 }
4120
4121 Ok(paths)
4122}
4123
4124fn find_cpumask(cgroup: &str) -> Cpumask {
4125 let mut path = String::from(cgroup);
4126 path.push_str("/cpuset.cpus.effective");
4127
4128    let description = fs::read_to_string(&path).unwrap();
4129
4130 Cpumask::from_cpulist(&description).unwrap()
4131}
4132
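/// Expand a template match into one concrete (match, cpumask) pair per cgroup
/// it applies to: CgroupSuffix and CgroupRegex templates are resolved against
/// the cgroups currently present under /sys/fs/cgroup, each paired with that
/// cgroup's effective cpuset.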
4133fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
4134 match rule {
4135 LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
4136 .into_iter()
4137 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4138 .filter(|cgroup| cgroup.ends_with(suffix))
4139 .map(|cgroup| {
4140 (
4141 {
4142 let mut slashterminated = cgroup.clone();
4143 slashterminated.push('/');
4144 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4145 },
4146 find_cpumask(&cgroup),
4147 )
4148 })
4149 .collect()),
4150 LayerMatch::CgroupRegex(expr) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
4151 .into_iter()
4152 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4153 .filter(|cgroup| {
4154 let re = Regex::new(expr).unwrap();
4155 re.is_match(cgroup)
4156 })
4157 .map(|cgroup| {
4158 (
4159 {
4163 let mut slashterminated = cgroup.clone();
4164 slashterminated.push('/');
4165 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4166 },
4167 find_cpumask(&cgroup),
4168 )
4169 })
4170 .collect()),
4171        _ => bail!("Unimplemented template match {:?}", rule),
4172 }
4173}
4174
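/// Open one raw perf counter per possible CPU for the memory-bandwidth event
/// and store the resulting fds in the BPF `scx_pmu_map`. Failures on
/// individual CPUs are tolerated and only logged.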
4175fn create_perf_fds(skel: &mut BpfSkel, event: u64) -> Result<()> {
4176 let mut attr = perf::bindings::perf_event_attr::default();
4177 attr.size = std::mem::size_of::<perf::bindings::perf_event_attr>() as u32;
4178 attr.type_ = perf::bindings::PERF_TYPE_RAW;
4179 attr.config = event;
4180 attr.sample_type = 0u64;
4181 attr.__bindgen_anon_1.sample_period = 0u64;
4182 attr.set_disabled(0);
4183
4184 let perf_events_map = &skel.maps.scx_pmu_map;
4185 let map_fd = unsafe { libbpf_sys::bpf_map__fd(perf_events_map.as_libbpf_object().as_ptr()) };
4186
4187 let mut failures = 0u64;
4188
4189 for cpu in 0..*NR_CPUS_POSSIBLE {
4190 let fd = unsafe { perf::perf_event_open(&mut attr as *mut _, -1, cpu as i32, -1, 0) };
4191 if fd < 0 {
4192 failures += 1;
4193 trace!(
4194 "perf_event_open failed cpu={cpu} errno={}",
4195 std::io::Error::last_os_error()
4196 );
4197 continue;
4198 }
4199
4200 let key = cpu as u32;
4201 let val = fd as u32;
4202 let ret = unsafe {
4203 libbpf_sys::bpf_map_update_elem(
4204 map_fd,
4205 &key as *const _ as *const _,
4206 &val as *const _ as *const _,
4207 0,
4208 )
4209 };
4210 if ret != 0 {
4211 trace!("bpf_map_update_elem failed cpu={cpu} fd={fd} ret={ret}");
4212 } else {
4213 trace!("mapped cpu={cpu} -> fd={fd}");
4214 }
4215 }
4216
4217 if failures > 0 {
4218        warn!("membw tracking: failed to install {failures} counters");
4219 }
4221
4222 Ok(())
4223}
4224
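/// Pick a memory-bandwidth PMU event based on the detected CPU codename
/// (AMD Zen fill events or Intel last-level-cache misses), program it into
/// rodata for the BPF side, and return the raw event config.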
4225fn setup_membw_tracking(skel: &mut OpenBpfSkel) -> Result<u64> {
4227 let pmumanager = PMUManager::new()?;
4228 let codename = &pmumanager.codename as &str;
4229
4230 let pmuspec = match codename {
4231 "amdzen1" | "amdzen2" | "amdzen3" => {
4232 trace!("found AMD codename {codename}");
4233 pmumanager.pmus.get("ls_any_fills_from_sys.mem_io_local")
4234 }
4235 "amdzen4" | "amdzen5" => {
4236 trace!("found AMD codename {codename}");
4237 pmumanager.pmus.get("ls_any_fills_from_sys.dram_io_all")
4238 }
4239
4240 "haswell" | "broadwell" | "broadwellde" | "broadwellx" | "skylake" | "skylakex"
4241 | "cascadelakex" | "arrowlake" | "meteorlake" | "sapphirerapids" | "emeraldrapids"
4242 | "graniterapids" => {
4243 trace!("found Intel codename {codename}");
4244 pmumanager.pmus.get("LONGEST_LAT_CACHE.MISS")
4245 }
4246
4247 _ => {
4248 trace!("found unknown codename {codename}");
4249 None
4250 }
4251 };
4252
4253    let spec = pmuspec.ok_or_else(|| anyhow!("no memory bandwidth PMU event known for codename {codename}"))?;
4254 let config = (spec.umask << 8) | spec.event[0];
4255
4256 skel.maps.rodata_data.as_mut().unwrap().membw_event = config;
4258
4259 Ok(config)
4260}
4261
4262#[clap_main::clap_main]
4263fn main(opts: Opts) -> Result<()> {
4264 if opts.version {
4265 println!(
4266 "scx_layered {}",
4267 build_id::full_version(env!("CARGO_PKG_VERSION"))
4268 );
4269 return Ok(());
4270 }
4271
4272 if opts.help_stats {
4273 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
4274 return Ok(());
4275 }
4276
4277 let env_filter = EnvFilter::try_from_default_env()
4278 .or_else(|_| match EnvFilter::try_new(&opts.log_level) {
4279 Ok(filter) => Ok(filter),
4280 Err(e) => {
4281 eprintln!(
4282                    "invalid log level {:?} ({}); falling back to info",
4283 opts.log_level, e
4284 );
4285 EnvFilter::try_new("info")
4286 }
4287 })
4288 .unwrap_or_else(|_| EnvFilter::new("info"));
4289
4290 match tracing_subscriber::fmt()
4291 .with_env_filter(env_filter)
4292 .with_target(true)
4293 .with_thread_ids(true)
4294 .with_file(true)
4295 .with_line_number(true)
4296 .try_init()
4297 {
4298 Ok(()) => {}
4299 Err(e) => eprintln!("failed to init logger: {}", e),
4300 }
4301
4302 if opts.verbose > 0 {
4303        warn!("Setting verbose via -v is deprecated and will be an error in future releases.");
4304 }
4305
4306    if opts.no_load_frac_limit {
4307        warn!("--no-load-frac-limit is deprecated and a no-op");
4308    }
4309    if opts.layer_preempt_weight_disable != 0.0 {
4310        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
4311    }
4312    if opts.layer_growth_weight_disable != 0.0 {
4313        warn!("--layer-growth-weight-disable is deprecated and a no-op");
4314    }
4315    if opts.local_llc_iteration {
4316        warn!("--local_llc_iteration is deprecated and a no-op");
4317    }
4318
4319 debug!("opts={:?}", &opts);
4320
4321 if let Some(run_id) = opts.run_id {
4322 info!("scx_layered run_id: {}", run_id);
4323 }
4324
4325 let shutdown = Arc::new(AtomicBool::new(false));
4326 let shutdown_clone = shutdown.clone();
4327 ctrlc::set_handler(move || {
4328 shutdown_clone.store(true, Ordering::Relaxed);
4329 })
4330 .context("Error setting Ctrl-C handler")?;
4331
4332 if let Some(intv) = opts.monitor.or(opts.stats) {
4333 let shutdown_copy = shutdown.clone();
4334 let jh = std::thread::spawn(move || {
4335 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
4336 Ok(_) => {
4337 debug!("stats monitor thread finished successfully")
4338 }
4339 Err(error_object) => {
4340 warn!(
4341 "stats monitor thread finished because of an error {}",
4342 error_object
4343 )
4344 }
4345 }
4346 });
4347 if opts.monitor.is_some() {
4348 let _ = jh.join();
4349 return Ok(());
4350 }
4351 }
4352
4353 if let Some(path) = &opts.example {
4354 write_example_file(path)?;
4355 return Ok(());
4356 }
4357
4358 let mut layer_config = match opts.run_example {
4359 true => EXAMPLE_CONFIG.clone(),
4360 false => LayerConfig { specs: vec![] },
4361 };
4362
4363 for (idx, input) in opts.specs.iter().enumerate() {
4364 let specs = LayerSpec::parse(input)
4365 .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
4366
4367 for spec in specs {
4368 match spec.template {
4369 Some(ref rule) => {
4370 let matches = expand_template(&rule)?;
4371 if matches.is_empty() {
4374 layer_config.specs.push(spec);
4375 } else {
4376 for (mt, mask) in matches {
4377 let mut genspec = spec.clone();
4378
4379 genspec.cpuset = Some(mask);
4380
4381 for orterm in &mut genspec.matches {
4383 orterm.push(mt.clone());
4384 }
4385
4386 match &mt {
4387 LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
4388 _ => bail!("Template match has unexpected type"),
4389 }
4390
4391 layer_config.specs.push(genspec);
4393 }
4394 }
4395 }
4396
4397 None => {
4398 layer_config.specs.push(spec);
4399 }
4400 }
4401 }
4402 }
4403
4404 for spec in layer_config.specs.iter_mut() {
4405 let common = spec.kind.common_mut();
4406
4407 if common.slice_us == 0 {
4408 common.slice_us = opts.slice_us;
4409 }
4410
4411 if common.weight == 0 {
4412 common.weight = DEFAULT_LAYER_WEIGHT;
4413 }
4414 common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
4415
4416 if common.preempt {
4417 if common.disallow_open_after_us.is_some() {
4418 warn!(
4419 "Preempt layer {} has non-null disallow_open_after_us, ignored",
4420 &spec.name
4421 );
4422 }
4423 if common.disallow_preempt_after_us.is_some() {
4424 warn!(
4425 "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
4426 &spec.name
4427 );
4428 }
4429 common.disallow_open_after_us = Some(u64::MAX);
4430 common.disallow_preempt_after_us = Some(u64::MAX);
4431 } else {
4432 if common.disallow_open_after_us.is_none() {
4433 common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
4434 }
4435
4436 if common.disallow_preempt_after_us.is_none() {
4437 common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
4438 }
4439 }
4440
4441 if common.idle_smt.is_some() {
4442 warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
4443 }
4444 }
4445
4446 let membw_required = layer_config.specs.iter().any(|spec| match spec.kind {
4447 LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
4448 membw_gb.is_some()
4449 }
4450 LayerKind::Open { .. } => false,
4451 });
4452
4453 if opts.print_and_exit {
4454 println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
4455 return Ok(());
4456 }
4457
4458 debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
4459 let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
4460
4461 let mut open_object = MaybeUninit::uninit();
4462 loop {
4463 let mut sched = Scheduler::init(
4464 &opts,
4465 &layer_config.specs,
4466 &mut open_object,
4467 &hint_to_layer_map,
4468 membw_required,
4469 )?;
4470 if !sched.run(shutdown.clone())?.should_restart() {
4471 break;
4472 }
4473 }
4474
4475 Ok(())
4476}