mod bpf_skel;
mod stats;

use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
use std::ffi::CString;
use std::fs;
use std::io::Write;
use std::mem::MaybeUninit;
use std::ops::Sub;
use std::path::Path;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;
use std::time::Instant;

use inotify::{Inotify, WatchMask};
use libc;
use std::os::unix::io::AsRawFd;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
pub use bpf_skel::*;
use clap::Parser;
use crossbeam::channel::Receiver;
use crossbeam::select;
use lazy_static::lazy_static;
use libbpf_rs::libbpf_sys;
use libbpf_rs::AsRawLibbpf;
use libbpf_rs::MapCore as _;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use nix::sched::CpuSet;
use nvml_wrapper::error::NvmlError;
use nvml_wrapper::Nvml;
use once_cell::sync::OnceCell;
use regex::Regex;
use scx_bpf_compat;
use scx_layered::*;
use scx_raw_pmu::PMUManager;
use scx_stats::prelude::*;
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::init_libbpf_logging;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::perf;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::read_netdevs;
use scx_utils::scx_enums;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Llc;
use scx_utils::NetDev;
use scx_utils::Topology;
use scx_utils::TopologyArgs;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPUS_POSSIBLE;
use scx_utils::NR_CPU_IDS;
use stats::LayerStats;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;
use std::collections::VecDeque;
use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use tracing::{debug, error, info, trace, warn};
use tracing_subscriber::filter::EnvFilter;
use walkdir::WalkDir;

const SCHEDULER_NAME: &str = "scx_layered";
const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;

const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
const LAYER_USAGE_PROTECTED_PREEMPT: usize =
    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;

const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;

const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;

static NVML: OnceCell<Nvml> = OnceCell::new();

fn nvml() -> Result<&'static Nvml, NvmlError> {
    NVML.get_or_try_init(Nvml::init)
}
115
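// Usage-decay factor and default open/preempt cutoffs derived from scheduler constants,
// plus a built-in example layer configuration.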
116lazy_static! {
117 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
118 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
119 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
120 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
121 specs: vec![
122 LayerSpec {
123 name: "batch".into(),
124 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
125 cpuset: None,
126 template: None,
127 matches: vec![
128 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
129 vec![LayerMatch::NiceAbove(0)],
130 ],
131 kind: LayerKind::Confined {
132 util_range: (0.8, 0.9),
133 cpus_range: Some((0, 16)),
134 cpus_range_frac: None,
135 protected: false,
136 membw_gb: None,
137 common: LayerCommon {
138 min_exec_us: 1000,
139 yield_ignore: 0.0,
140 preempt: false,
141 preempt_first: false,
142 exclusive: false,
143 allow_node_aligned: false,
144 skip_remote_node: false,
145 prev_over_idle_core: false,
146 idle_smt: None,
147 slice_us: 20000,
148 fifo: false,
149 weight: DEFAULT_LAYER_WEIGHT,
150 disallow_open_after_us: None,
151 disallow_preempt_after_us: None,
152 xllc_mig_min_us: 1000.0,
153 growth_algo: LayerGrowthAlgo::Sticky,
154 idle_resume_us: None,
155 perf: 1024,
156 nodes: vec![],
157 llcs: vec![],
158 member_expire_ms: 0,
159 placement: LayerPlacement::Standard,
160 },
161 },
162 },
163 LayerSpec {
164 name: "immediate".into(),
165 comment: Some("tasks under workload.slice with nice value < 0".into()),
166 cpuset: None,
167 template: None,
168 matches: vec![vec![
169 LayerMatch::CgroupPrefix("workload.slice/".into()),
170 LayerMatch::NiceBelow(0),
171 ]],
172 kind: LayerKind::Open {
173 common: LayerCommon {
174 min_exec_us: 100,
175 yield_ignore: 0.25,
176 preempt: true,
177 preempt_first: false,
178 exclusive: true,
179 allow_node_aligned: true,
180 skip_remote_node: false,
181 prev_over_idle_core: true,
182 idle_smt: None,
183 slice_us: 20000,
184 fifo: false,
185 weight: DEFAULT_LAYER_WEIGHT,
186 disallow_open_after_us: None,
187 disallow_preempt_after_us: None,
188 xllc_mig_min_us: 0.0,
189 growth_algo: LayerGrowthAlgo::Sticky,
190 perf: 1024,
191 idle_resume_us: None,
192 nodes: vec![],
193 llcs: vec![],
194 member_expire_ms: 0,
195 placement: LayerPlacement::Standard,
196 },
197 },
198 },
199 LayerSpec {
200 name: "stress-ng".into(),
201 comment: Some("stress-ng test layer".into()),
202 cpuset: None,
203 template: None,
204 matches: vec![
205 vec![LayerMatch::CommPrefix("stress-ng".into()),],
206 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
207 ],
208 kind: LayerKind::Confined {
209 cpus_range: None,
210 util_range: (0.2, 0.8),
211 protected: false,
212 cpus_range_frac: None,
213 membw_gb: None,
214 common: LayerCommon {
215 min_exec_us: 800,
216 yield_ignore: 0.0,
217 preempt: true,
218 preempt_first: false,
219 exclusive: false,
220 allow_node_aligned: false,
221 skip_remote_node: false,
222 prev_over_idle_core: false,
223 idle_smt: None,
224 slice_us: 800,
225 fifo: false,
226 weight: DEFAULT_LAYER_WEIGHT,
227 disallow_open_after_us: None,
228 disallow_preempt_after_us: None,
229 xllc_mig_min_us: 0.0,
230 growth_algo: LayerGrowthAlgo::Topo,
231 perf: 1024,
232 idle_resume_us: None,
233 nodes: vec![],
234 llcs: vec![],
235 member_expire_ms: 0,
236 placement: LayerPlacement::Standard,
237 },
238 },
239 },
240 LayerSpec {
241 name: "normal".into(),
242 comment: Some("the rest".into()),
243 cpuset: None,
244 template: None,
245 matches: vec![vec![]],
246 kind: LayerKind::Grouped {
247 cpus_range: None,
248 util_range: (0.5, 0.6),
249 util_includes_open_cputime: true,
250 protected: false,
251 cpus_range_frac: None,
252 membw_gb: None,
253 common: LayerCommon {
254 min_exec_us: 200,
255 yield_ignore: 0.0,
256 preempt: false,
257 preempt_first: false,
258 exclusive: false,
259 allow_node_aligned: false,
260 skip_remote_node: false,
261 prev_over_idle_core: false,
262 idle_smt: None,
263 slice_us: 20000,
264 fifo: false,
265 weight: DEFAULT_LAYER_WEIGHT,
266 disallow_open_after_us: None,
267 disallow_preempt_after_us: None,
268 xllc_mig_min_us: 100.0,
269 growth_algo: LayerGrowthAlgo::Linear,
270 perf: 1024,
271 idle_resume_us: None,
272 nodes: vec![],
273 llcs: vec![],
274 member_expire_ms: 0,
275 placement: LayerPlacement::Standard,
276 },
277 },
278 },
279 ],
280 };
281}
282
283#[derive(Debug, Parser)]
571#[command(verbatim_doc_comment)]
572struct Opts {
573 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
575 verbose: u8,
576
577 #[clap(short = 's', long, default_value = "20000")]
579 slice_us: u64,
580
581 #[clap(short = 'M', long, default_value = "0")]
586 max_exec_us: u64,
587
588 #[clap(short = 'i', long, default_value = "0.1")]
590 interval: f64,
591
592 #[clap(short = 'n', long, default_value = "false")]
595 no_load_frac_limit: bool,
596
597 #[clap(long, default_value = "0")]
599 exit_dump_len: u32,
600
601 #[clap(long, default_value = "info")]
604 log_level: String,
605
606 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
610 disable_topology: Option<bool>,
611
612 #[clap(long)]
614 xnuma_preemption: bool,
615
616 #[clap(long)]
618 monitor_disable: bool,
619
620 #[clap(short = 'e', long)]
622 example: Option<String>,
623
624 #[clap(long, default_value = "0.0")]
628 layer_preempt_weight_disable: f64,
629
630 #[clap(long, default_value = "0.0")]
634 layer_growth_weight_disable: f64,
635
636 #[clap(long)]
638 stats: Option<f64>,
639
640 #[clap(long)]
643 monitor: Option<f64>,
644
645 #[clap(long)]
647 run_example: bool,
648
649 #[clap(long, default_value = "false")]
652 local_llc_iteration: bool,
653
654 #[clap(long, default_value = "10000")]
659 lo_fb_wait_us: u64,
660
661 #[clap(long, default_value = ".05")]
664 lo_fb_share: f64,
665
666 #[clap(long, default_value = "false")]
668 disable_antistall: bool,
669
670 #[clap(long, default_value = "false")]
672 enable_gpu_affinitize: bool,
673
674 #[clap(long, default_value = "900")]
677 gpu_affinitize_secs: u64,
678
679 #[clap(long, default_value = "false")]
684 enable_match_debug: bool,
685
686 #[clap(long, default_value = "3")]
688 antistall_sec: u64,
689
690 #[clap(long, default_value = "false")]
692 enable_gpu_support: bool,
693
694 #[clap(long, default_value = "3")]
702 gpu_kprobe_level: u64,
703
704 #[clap(long, default_value = "false")]
706 netdev_irq_balance: bool,
707
708 #[clap(long, default_value = "false")]
710 disable_queued_wakeup: bool,
711
712 #[clap(long, default_value = "false")]
714 disable_percpu_kthread_preempt: bool,
715
716 #[clap(long, default_value = "false")]
720 percpu_kthread_preempt_all: bool,
721
722 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
724 version: bool,
725
726 #[clap(long)]
728 run_id: Option<u64>,
729
730 #[clap(long)]
732 help_stats: bool,
733
734 specs: Vec<String>,
736
737 #[clap(long, default_value = "2000")]
740 layer_refresh_ms_avgruntime: u64,
741
742 #[clap(long, default_value = "")]
744 task_hint_map: String,
745
746 #[clap(long, default_value = "false")]
748 print_and_exit: bool,
749
750 #[clap(long, default_value = "")]
752 hi_fb_thread_name: String,
753
754 #[clap(flatten, next_help_heading = "Topology Options")]
755 topology: TopologyArgs,
756
757 #[clap(flatten, next_help_heading = "Libbpf Options")]
758 pub libbpf: LibbpfOpts,
759}
760
#[derive(Debug, Clone)]
enum CgroupEvent {
    Created {
        path: String,
        cgroup_id: u64,
        match_bitmap: u64,
    },
    Removed {
        path: String,
        cgroup_id: u64,
    },
}
774
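// Read the aggregate CPU time counters from /proc/stat.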
775fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
776 reader
777 .read_stat()
778 .context("Failed to read procfs")?
779 .total_cpu
780 .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
781}
782
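// Compute the busy CPU fraction between two /proc/stat snapshots, clamped to [0.0, 1.0].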
783fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
784 match (curr, prev) {
785 (
786 fb_procfs::CpuStat {
787 user_usec: Some(curr_user),
788 nice_usec: Some(curr_nice),
789 system_usec: Some(curr_system),
790 idle_usec: Some(curr_idle),
791 iowait_usec: Some(curr_iowait),
792 irq_usec: Some(curr_irq),
793 softirq_usec: Some(curr_softirq),
794 stolen_usec: Some(curr_stolen),
795 ..
796 },
797 fb_procfs::CpuStat {
798 user_usec: Some(prev_user),
799 nice_usec: Some(prev_nice),
800 system_usec: Some(prev_system),
801 idle_usec: Some(prev_idle),
802 iowait_usec: Some(prev_iowait),
803 irq_usec: Some(prev_irq),
804 softirq_usec: Some(prev_softirq),
805 stolen_usec: Some(prev_stolen),
806 ..
807 },
808 ) => {
809 let idle_usec = curr_idle.saturating_sub(*prev_idle);
810 let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
811 let user_usec = curr_user.saturating_sub(*prev_user);
812 let system_usec = curr_system.saturating_sub(*prev_system);
813 let nice_usec = curr_nice.saturating_sub(*prev_nice);
814 let irq_usec = curr_irq.saturating_sub(*prev_irq);
815 let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
816 let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
817
818 let busy_usec =
819 user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
820 let total_usec = idle_usec + busy_usec + iowait_usec;
821 if total_usec > 0 {
822 Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
823 } else {
824 Ok(1.0)
825 }
826 }
827 _ => bail!("Missing stats in cpustat"),
828 }
829}
830
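// Copy `src` into a fixed-size i8 buffer as a NUL-terminated C string. Panics if the
// string does not fit in `dst`.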
831fn copy_into_cstr(dst: &mut [i8], src: &str) {
832 let cstr = CString::new(src).unwrap();
833 let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
834 dst[0..bytes.len()].copy_from_slice(bytes);
835}
836
837fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
838 let mut mask = 0;
839 for node in nodes {
840 mask |= 1 << node;
841 }
842 mask
843}
844
845fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
846 let mut mask = 0;
847 for (_, cache) in llcs {
848 mask |= 1 << cache.id;
849 }
850 mask
851}
852
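// Snapshot every CPU's cpu_ctx struct from the BPF per-CPU array map.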
853fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
854 let mut cpu_ctxs = vec![];
855 let cpu_ctxs_vec = skel
856 .maps
857 .cpu_ctxs
858 .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
859 .context("Failed to lookup cpu_ctx")?
860 .unwrap();
861 for cpu in 0..*NR_CPUS_POSSIBLE {
862 cpu_ctxs.push(*unsafe {
863 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
864 });
865 }
866 Ok(cpu_ctxs)
867}
868
#[derive(Clone, Debug)]
struct BpfStats {
    gstats: Vec<u64>,
    lstats: Vec<Vec<u64>>,
    lstats_sums: Vec<u64>,
    llc_lstats: Vec<Vec<Vec<u64>>>,
}
876
877impl BpfStats {
878 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
879 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
880 let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
881 let mut gstats = vec![0u64; NR_GSTATS];
882 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
883 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
884
885 for cpu in 0..*NR_CPUS_POSSIBLE {
886 for stat in 0..NR_GSTATS {
887 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
888 }
889 for layer in 0..nr_layers {
890 for stat in 0..NR_LSTATS {
891 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
892 }
893 }
894 }
895
896 let mut lstats_sums = vec![0u64; NR_LSTATS];
897 for layer in 0..nr_layers {
898 for stat in 0..NR_LSTATS {
899 lstats_sums[stat] += lstats[layer][stat];
900 }
901 }
902
903 for llc_id in 0..nr_llcs {
904 let key = llc_id as u32;
910 let llc_id_slice =
911 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
912 let v = skel
913 .maps
914 .llc_data
915 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
916 .unwrap()
917 .unwrap();
918 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
919
920 for layer_id in 0..nr_layers {
921 for stat_id in 0..NR_LLC_LSTATS {
922 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
923 }
924 }
925 }
926
927 Self {
928 gstats,
929 lstats,
930 lstats_sums,
931 llc_lstats,
932 }
933 }
934}
935
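// Subtracting two snapshots yields per-interval deltas. The LLC latency stat is a running
// value maintained on the BPF side, so it is carried over from the left-hand snapshot
// instead of being diffed.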
936impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
937 type Output = BpfStats;
938
939 fn sub(self, rhs: &'b BpfStats) -> BpfStats {
940 let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
941 BpfStats {
942 gstats: vec_sub(&self.gstats, &rhs.gstats),
943 lstats: self
944 .lstats
945 .iter()
946 .zip(rhs.lstats.iter())
947 .map(|(l, r)| vec_sub(l, r))
948 .collect(),
949 lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
950 llc_lstats: self
951 .llc_lstats
952 .iter()
953 .zip(rhs.llc_lstats.iter())
954 .map(|(l_layer, r_layer)| {
955 l_layer
956 .iter()
957 .zip(r_layer.iter())
958 .map(|(l_llc, r_llc)| {
959 let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
960 r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
962 vec_sub(&l_llc, &r_llc)
963 })
964 .collect()
965 })
966 .collect(),
967 }
968 }
969}
970
#[derive(Clone, Debug)]
struct Stats {
    at: Instant,
    elapsed: Duration,
    nr_layers: usize,
    nr_layer_tasks: Vec<usize>,
    nr_nodes: usize,

    total_util: f64,
    layer_utils: Vec<Vec<f64>>,
    prev_layer_usages: Vec<Vec<u64>>,

    layer_membws: Vec<Vec<f64>>,
    prev_layer_membw_agg: Vec<Vec<u64>>,

    cpu_busy: f64,
    prev_total_cpu: fb_procfs::CpuStat,
    prev_pmu_resctrl_membw: (u64, u64),
    system_cpu_util_ewma: f64,
    layer_dsq_insert_ewma: Vec<f64>,

    bpf_stats: BpfStats,
    prev_bpf_stats: BpfStats,

    processing_dur: Duration,
    prev_processing_dur: Duration,

    layer_slice_us: Vec<u64>,

    gpu_tasks_affinitized: u64,
    gpu_task_affinitization_ms: u64,
}
1004
1005impl Stats {
1006 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1007 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1008
1009 for cpu in 0..*NR_CPUS_POSSIBLE {
1010 for layer in 0..nr_layers {
1011 for usage in 0..NR_LAYER_USAGES {
1012 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
1013 }
1014 }
1015 }
1016
1017 layer_usages
1018 }
1019
1020 fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
1022 let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
1023
1024 for cpu in 0..*NR_CPUS_POSSIBLE {
1025 for layer in 0..nr_layers {
1026 for usage in 0..NR_LAYER_USAGES {
1027 layer_membw_agg[layer][usage] += cpu_ctxs[cpu].layer_membw_agg[layer][usage];
1028 }
1029 }
1030 }
1031
1032 layer_membw_agg
1033 }
1034
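    // Sum mbm_total_bytes across all resctrl monitoring groups under /sys/fs/resctrl/mon_data.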
1035 fn resctrl_read_total_membw() -> Result<u64> {
1047 let mut total_membw = 0u64;
1048 for entry in WalkDir::new("/sys/fs/resctrl/mon_data")
1049 .min_depth(1)
1050 .into_iter()
1051 .filter_map(Result::ok)
1052 .filter(|x| x.path().is_dir())
1053 {
1054 let mut path = entry.path().to_path_buf();
1055 path.push("mbm_total_bytes");
1056 total_membw += fs::read_to_string(path)?.trim().parse::<u64>()?;
1057 }
1058
1059 Ok(total_membw)
1060 }
1061
1062 fn new(
1063 skel: &mut BpfSkel,
1064 proc_reader: &fb_procfs::ProcReader,
1065 gpu_task_affinitizer: &GpuTaskAffinitizer,
1066 ) -> Result<Self> {
1067 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
1068 let cpu_ctxs = read_cpu_ctxs(skel)?;
1069 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1070 let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
1071 let pmu_membw = Self::read_layer_membw_agg(&cpu_ctxs, nr_layers);
1072
1073 Ok(Self {
1074 at: Instant::now(),
1075 elapsed: Default::default(),
1076 nr_layers,
1077 nr_layer_tasks: vec![0; nr_layers],
1078 nr_nodes,
1079
1080 total_util: 0.0,
1081 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1082 layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1083 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1084 prev_layer_membw_agg: pmu_membw,
1088 prev_pmu_resctrl_membw: (0, 0),
1089
1090 cpu_busy: 0.0,
1091 prev_total_cpu: read_total_cpu(proc_reader)?,
1092 system_cpu_util_ewma: 0.0,
1093 layer_dsq_insert_ewma: vec![0.0; nr_layers],
1094
1095 bpf_stats: bpf_stats.clone(),
1096 prev_bpf_stats: bpf_stats,
1097
1098 processing_dur: Default::default(),
1099 prev_processing_dur: Default::default(),
1100
1101 layer_slice_us: vec![0; nr_layers],
1102 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1103 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1104 })
1105 }
1106
1107 fn refresh(
1108 &mut self,
1109 skel: &mut BpfSkel,
1110 proc_reader: &fb_procfs::ProcReader,
1111 now: Instant,
1112 cur_processing_dur: Duration,
1113 gpu_task_affinitizer: &GpuTaskAffinitizer,
1114 ) -> Result<()> {
1115 let elapsed = now.duration_since(self.at);
1116 let elapsed_f64 = elapsed.as_secs_f64();
1117 let cpu_ctxs = read_cpu_ctxs(skel)?;
1118
1119 let nr_layer_tasks: Vec<usize> = skel
1120 .maps
1121 .bss_data
1122 .as_ref()
1123 .unwrap()
1124 .layers
1125 .iter()
1126 .take(self.nr_layers)
1127 .map(|layer| layer.nr_tasks as usize)
1128 .collect();
1129 let layer_slice_us: Vec<u64> = skel
1130 .maps
1131 .bss_data
1132 .as_ref()
1133 .unwrap()
1134 .layers
1135 .iter()
1136 .take(self.nr_layers)
1137 .map(|layer| layer.slice_ns / 1000_u64)
1138 .collect();
1139
1140 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1141 let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1142
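        // Scale the PMU-derived per-layer memory-bandwidth counters by the ratio of
        // resctrl-reported bytes to PMU-counted bytes over the same interval. If the PMU
        // counters did not advance this interval, the factor becomes non-finite.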
1143 let (pmu_prev, resctrl_prev) = self.prev_pmu_resctrl_membw;
1151 let pmu_cur: u64 = cur_layer_membw_agg
1152 .iter()
1153 .map(|membw_agg| membw_agg[LAYER_USAGE_OPEN] + membw_agg[LAYER_USAGE_OWNED])
1154 .sum();
1155 let resctrl_cur = Self::resctrl_read_total_membw()?;
1156 let factor = (resctrl_cur - resctrl_prev) as f64 / (pmu_cur - pmu_prev) as f64;
1157
1158 let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1160 cur_agg
1161 .iter()
1162 .zip(prev_agg.iter())
1163 .map(|(cur, prev)| {
1164 cur.iter()
1165 .zip(prev.iter())
1166 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1167 .collect()
1168 })
1169 .collect()
1170 };
1171
1172 let compute_mem_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1175 cur_agg
1176 .iter()
1177 .zip(prev_agg.iter())
1178 .map(|(cur, prev)| {
1179 cur.iter()
1180 .zip(prev.iter())
1181 .map(|(c, p)| (*c as i64 - *p as i64) as f64 / (1024 as f64).powf(3.0))
1182 .collect()
1183 })
1184 .collect()
1185 };
1186
1187 let cur_layer_utils: Vec<Vec<f64>> =
1188 compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1189
1190 let cur_layer_membw: Vec<Vec<f64>> =
1192 compute_mem_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1193
1194 let cur_layer_membw: Vec<Vec<f64>> = cur_layer_membw
1195 .iter()
1196 .map(|x| x.iter().map(|x| *x * factor).collect())
1197 .collect();
1198
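        // Exponentially decay the previous metric toward the current sample. `decay_rate`
        // is the per-second retention; 0.0 means the new sample replaces the old value outright.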
1199 let metric_decay =
1200 |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>, decay_rate: f64| {
1201 cur_metric
1202 .iter()
1203 .zip(prev_metric.iter())
1204 .map(|(cur, prev)| {
1205 cur.iter()
1206 .zip(prev.iter())
1207 .map(|(c, p)| {
1208 let decay = decay_rate.powf(elapsed_f64);
1209 p * decay + c * (1.0 - decay)
1210 })
1211 .collect()
1212 })
1213 .collect()
1214 };
1215
1216 let layer_utils: Vec<Vec<f64>> =
1217 metric_decay(cur_layer_utils, &self.layer_utils, *USAGE_DECAY);
1218 let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws, 0.0);
1219
1220 let cur_total_cpu = read_total_cpu(proc_reader)?;
1221 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1222
1223 const SYS_CPU_UTIL_EWMA_SECS: f64 = 10.0;
1225 let elapsed_f64 = elapsed.as_secs_f64();
1226 let alpha = elapsed_f64 / SYS_CPU_UTIL_EWMA_SECS.max(elapsed_f64);
1227 let system_cpu_util_ewma = alpha * cpu_busy + (1.0 - alpha) * self.system_cpu_util_ewma;
1228
1229 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1230 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1231
1232 const DSQ_INSERT_EWMA_SECS: f64 = 10.0;
1234 let dsq_alpha = elapsed_f64 / DSQ_INSERT_EWMA_SECS.max(elapsed_f64);
1235 let layer_dsq_insert_ewma: Vec<f64> = (0..self.nr_layers)
1236 .map(|layer_id| {
1237 let sel_local = bpf_stats.lstats[layer_id]
1238 [bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize]
1239 as f64;
1240 let enq_local = bpf_stats.lstats[layer_id]
1241 [bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize]
1242 as f64;
1243 let enq_dsq = bpf_stats.lstats[layer_id]
1244 [bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize]
1245 as f64;
1246 let total_dispatches = sel_local + enq_local + enq_dsq;
1247
1248 let cur_ratio = if total_dispatches > 0.0 {
1249 enq_dsq / total_dispatches
1250 } else {
1251 0.0
1252 };
1253
1254 dsq_alpha * cur_ratio + (1.0 - dsq_alpha) * self.layer_dsq_insert_ewma[layer_id]
1255 })
1256 .collect();
1257
1258 let processing_dur = cur_processing_dur
1259 .checked_sub(self.prev_processing_dur)
1260 .unwrap();
1261
1262 *self = Self {
1263 at: now,
1264 elapsed,
1265 nr_layers: self.nr_layers,
1266 nr_layer_tasks,
1267 nr_nodes: self.nr_nodes,
1268
1269 total_util: layer_utils
1270 .iter()
1271 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1272 .sum(),
1273 layer_utils,
1274 layer_membws,
1275 prev_layer_usages: cur_layer_usages,
1276 prev_layer_membw_agg: cur_layer_membw_agg,
1277 prev_pmu_resctrl_membw: (pmu_cur, resctrl_cur),
1279
1280 cpu_busy,
1281 prev_total_cpu: cur_total_cpu,
1282 system_cpu_util_ewma,
1283 layer_dsq_insert_ewma,
1284
1285 bpf_stats,
1286 prev_bpf_stats: cur_bpf_stats,
1287
1288 processing_dur,
1289 prev_processing_dur: cur_processing_dur,
1290
1291 layer_slice_us,
1292 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1293 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1294 };
1295 Ok(())
1296 }
1297}
1298
1299#[derive(Debug)]
1300struct Layer {
1301 name: String,
1302 kind: LayerKind,
1303 growth_algo: LayerGrowthAlgo,
1304 core_order: Vec<usize>,
1305
1306 target_llc_cpus: (usize, usize),
1307 assigned_llcs: Vec<usize>,
1308
1309 nr_cpus: usize,
1310 nr_llc_cpus: Vec<usize>,
1311 cpus: Cpumask,
1312 allowed_cpus: Cpumask,
1313}
1314
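// Resolve a kernel symbol address from /proc/kallsyms (first line whose text contains `sym_name`).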
1315fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1316 fs::read_to_string("/proc/kallsyms")?
1317 .lines()
1318 .find(|line| line.contains(sym_name))
1319 .and_then(|line| line.split_whitespace().next())
1320 .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1321 .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1322}
1323
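// Resolve the effective (min, max) CPU count from either an absolute `cpus_range` or a
// fractional `cpus_range_frac`; specifying both is an error.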
1324fn resolve_cpus_pct_range(
1325 cpus_range: &Option<(usize, usize)>,
1326 cpus_range_frac: &Option<(f64, f64)>,
1327 max_cpus: usize,
1328) -> Result<(usize, usize)> {
1329 match (cpus_range, cpus_range_frac) {
1330 (Some(_x), Some(_y)) => {
            bail!("cpus_range cannot be used with cpus_range_frac.");
1332 }
1333 (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1334 (None, Some((cpus_frac_min, cpus_frac_max))) => {
1335 if *cpus_frac_min < 0_f64
1336 || *cpus_frac_min > 1_f64
1337 || *cpus_frac_max < 0_f64
1338 || *cpus_frac_max > 1_f64
1339 {
1340 bail!("cpus_range_frac values must be between 0.0 and 1.0");
1341 }
1342 let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1343 let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1344 Ok((
1345 std::cmp::max(cpus_min_count, 1),
1346 std::cmp::min(cpus_max_count, max_cpus),
1347 ))
1348 }
1349 (None, None) => Ok((0, max_cpus)),
1350 }
1351}
1352
1353impl Layer {
1354 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1355 let name = &spec.name;
1356 let kind = spec.kind.clone();
1357 let mut allowed_cpus = Cpumask::new();
1358 match &kind {
1359 LayerKind::Confined {
1360 cpus_range,
1361 cpus_range_frac,
1362 common: LayerCommon { nodes, llcs, .. },
1363 ..
1364 } => {
1365 let cpus_range =
1366 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1367 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1368 bail!("invalid cpus_range {:?}", cpus_range);
1369 }
1370 if nodes.is_empty() && llcs.is_empty() {
1371 allowed_cpus.set_all();
1372 } else {
1373 for (node_id, node) in &topo.nodes {
1375 if nodes.contains(node_id) {
1377 for &id in node.all_cpus.keys() {
1378 allowed_cpus.set_cpu(id)?;
1379 }
1380 }
1381 for (llc_id, llc) in &node.llcs {
1383 if llcs.contains(llc_id) {
1384 for &id in llc.all_cpus.keys() {
1385 allowed_cpus.set_cpu(id)?;
1386 }
1387 }
1388 }
1389 }
1390 }
1391 }
1392 LayerKind::Grouped {
1393 common: LayerCommon { nodes, llcs, .. },
1394 ..
1395 }
1396 | LayerKind::Open {
1397 common: LayerCommon { nodes, llcs, .. },
1398 ..
1399 } => {
1400 if nodes.is_empty() && llcs.is_empty() {
1401 allowed_cpus.set_all();
1402 } else {
1403 for (node_id, node) in &topo.nodes {
1405 if nodes.contains(node_id) {
1407 for &id in node.all_cpus.keys() {
1408 allowed_cpus.set_cpu(id)?;
1409 }
1410 }
1411 for (llc_id, llc) in &node.llcs {
1413 if llcs.contains(llc_id) {
1414 for &id in llc.all_cpus.keys() {
1415 allowed_cpus.set_cpu(id)?;
1416 }
1417 }
1418 }
1419 }
1420 }
1421 }
1422 }
1423
1424 if let Some(util_range) = kind.util_range() {
1427 if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1428 bail!("invalid util_range {:?}", util_range);
1429 }
1430 }
1431
1432 let layer_growth_algo = kind.common().growth_algo.clone();
1433
1434 debug!(
1435 "layer: {} algo: {:?} core order: {:?}",
1436 name, &layer_growth_algo, core_order
1437 );
1438
1439 Ok(Self {
1440 name: name.into(),
1441 kind,
1442 growth_algo: layer_growth_algo,
1443 core_order: core_order.clone(),
1444
1445 target_llc_cpus: (0, 0),
1446 assigned_llcs: vec![],
1447
1448 nr_cpus: 0,
1449 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1450 cpus: Cpumask::new(),
1451 allowed_cpus,
1452 })
1453 }
1454
1455 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1456 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1457 Some(ret) => ret.clone(),
1458 None => return Ok(0),
1459 };
1460
1461 let nr_to_free = cpus_to_free.weight();
1462
1463 Ok(if nr_to_free <= max_to_free {
1464 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1465 self.cpus &= &cpus_to_free.not();
1466 self.nr_cpus -= nr_to_free;
1467 for cpu in cpus_to_free.iter() {
1468 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1469 }
1470 cpu_pool.free(&cpus_to_free)?;
1471 nr_to_free
1472 } else {
1473 0
1474 })
1475 }
1476
1477 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1478 let new_cpus = match cpu_pool
1479 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1480 .clone()
1481 {
1482 Some(ret) => ret.clone(),
1483 None => {
1484 trace!("layer-{} can't grow, no CPUs", &self.name);
1485 return Ok(0);
1486 }
1487 };
1488
1489 let nr_new_cpus = new_cpus.weight();
1490
1491 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1492 self.cpus |= &new_cpus;
1493 self.nr_cpus += nr_new_cpus;
1494 for cpu in new_cpus.iter() {
1495 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1496 }
1497 Ok(nr_new_cpus)
1498 }
1499}
1500#[derive(Debug, Clone)]
1501struct NodeInfo {
1502 node_mask: nix::sched::CpuSet,
1503 _node_id: usize,
1504}
1505
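// Periodically pins GPU-using processes (and their descendants and threads) to the NUMA
// node closest to the GPU they use, as reported by NVML.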
1506#[derive(Debug)]
1507struct GpuTaskAffinitizer {
1508 gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1511 gpu_pids_to_devs: HashMap<Pid, u32>,
1512 last_process_time: Option<Instant>,
1513 sys: System,
1514 pid_map: HashMap<Pid, Vec<Pid>>,
1515 poll_interval: Duration,
1516 enable: bool,
1517 tasks_affinitized: u64,
1518 last_task_affinitization_ms: u64,
1519}
1520
1521impl GpuTaskAffinitizer {
1522 pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1523 GpuTaskAffinitizer {
1524 gpu_devs_to_node_info: HashMap::new(),
1525 gpu_pids_to_devs: HashMap::new(),
1526 last_process_time: None,
1527 sys: System::default(),
1528 pid_map: HashMap::new(),
1529 poll_interval: Duration::from_secs(poll_interval),
1530 enable,
1531 tasks_affinitized: 0,
1532 last_task_affinitization_ms: 0,
1533 }
1534 }
1535
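    // Return the index of the first set bit in the NVML CPU-affinity bitmask.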
1536 fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1537 for (chunk, &mask) in affinity.iter().enumerate() {
1538 let mut inner_offset: u64 = 1;
1539 for _ in 0..64 {
1540 if (mask & inner_offset) != 0 {
1541 return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1542 }
1543 inner_offset = inner_offset << 1;
1544 }
1545 }
1546 anyhow::bail!("unable to get CPU from NVML bitmask");
1547 }
1548
1549 fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1550 let mut cpuset = CpuSet::new();
1551 for (cpu_id, _cpu) in &node.all_cpus {
1552 cpuset.set(*cpu_id)?;
1553 }
1554 Ok(cpuset)
1555 }
1556
1557 fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1558 let nvml = nvml()?;
1559 let device_count = nvml.device_count()?;
1560
1561 for idx in 0..device_count {
1562 let dev = nvml.device_by_index(idx)?;
1563 let cpu = dev.cpu_affinity(16)?;
1565 let ideal_cpu = self.find_one_cpu(cpu)?;
1566 if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1567 self.gpu_devs_to_node_info.insert(
1568 idx,
1569 NodeInfo {
1570 node_mask: self.node_to_cpuset(
1571 topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1572 )?,
1573 _node_id: cpu.node_id,
1574 },
1575 );
1576 }
1577 }
1578 Ok(())
1579 }
1580
1581 fn update_gpu_pids(&mut self) -> Result<()> {
1582 let nvml = nvml()?;
1583 for i in 0..nvml.device_count()? {
1584 let device = nvml.device_by_index(i)?;
1585 for proc in device
1586 .running_compute_processes()?
1587 .into_iter()
1588 .chain(device.running_graphics_processes()?.into_iter())
1589 {
1590 self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1591 }
1592 }
1593 Ok(())
1594 }
1595
1596 fn update_process_info(&mut self) -> Result<()> {
1597 self.sys.refresh_processes_specifics(
1598 ProcessesToUpdate::All,
1599 true,
1600 ProcessRefreshKind::nothing(),
1601 );
1602 self.pid_map.clear();
1603 for (pid, proc_) in self.sys.processes() {
1604 if let Some(ppid) = proc_.parent() {
1605 self.pid_map.entry(ppid).or_default().push(*pid);
1606 }
1607 }
1608 Ok(())
1609 }
1610
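    // Breadth-first walk of the process tree rooted at `root_pid`, collecting descendant
    // PIDs and their thread IDs.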
1611 fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1612 let mut work = VecDeque::from([root_pid]);
1613 let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1614
1615 while let Some(pid) = work.pop_front() {
1616 if pids_and_tids.insert(pid) {
1617 if let Some(kids) = self.pid_map.get(&pid) {
1618 work.extend(kids);
1619 }
1620 if let Some(proc_) = self.sys.process(pid) {
1621 if let Some(tasks) = proc_.tasks() {
1622 pids_and_tids.extend(tasks.iter().copied());
1623 }
1624 }
1625 }
1626 }
1627 pids_and_tids
1628 }
1629
1630 fn affinitize_gpu_pids(&mut self) -> Result<()> {
1631 if !self.enable {
1632 return Ok(());
1633 }
1634 for (pid, dev) in &self.gpu_pids_to_devs {
1635 let node_info = self
1636 .gpu_devs_to_node_info
1637 .get(&dev)
1638 .expect("Unable to get gpu pid node mask");
1639 for child in self.get_child_pids_and_tids(*pid) {
1640 match nix::sched::sched_setaffinity(
1641 nix::unistd::Pid::from_raw(child.as_u32() as i32),
1642 &node_info.node_mask,
1643 ) {
1644 Ok(_) => {
1645 self.tasks_affinitized += 1;
1647 }
1648 Err(_) => {
1649 debug!(
1650 "Error affinitizing gpu pid {} to node {:#?}",
1651 child.as_u32(),
1652 node_info
1653 );
1654 }
1655 };
1656 }
1657 }
1658 Ok(())
1659 }
1660
1661 pub fn maybe_affinitize(&mut self) {
1662 if !self.enable {
1663 return;
1664 }
1665 let now = Instant::now();
1666
1667 if let Some(last_process_time) = self.last_process_time {
1668 if (now - last_process_time) < self.poll_interval {
1669 return;
1670 }
1671 }
1672
1673 match self.update_gpu_pids() {
1674 Ok(_) => {}
1675 Err(e) => {
1676 error!("Error updating GPU PIDs: {}", e);
1677 }
1678 };
1679 match self.update_process_info() {
1680 Ok(_) => {}
1681 Err(e) => {
1682 error!("Error updating process info to affinitize GPU PIDs: {}", e);
1683 }
1684 };
1685 match self.affinitize_gpu_pids() {
1686 Ok(_) => {}
1687 Err(e) => {
                error!("Error affinitizing GPU PIDs: {}", e);
1689 }
1690 };
1691 self.last_process_time = Some(now);
1692 self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1693
1694 return;
1695 }
1696
1697 pub fn init(&mut self, topo: Arc<Topology>) {
1698 if !self.enable || self.last_process_time.is_some() {
1699 return;
1700 }
1701
1702 match self.init_dev_node_map(topo) {
1703 Ok(_) => {}
1704 Err(e) => {
1705 error!("Error initializing gpu node dev map: {}", e);
1706 }
1707 };
1708 self.sys = System::new_all();
1709 return;
1710 }
1711}
1712
1713struct Scheduler<'a> {
1714 skel: BpfSkel<'a>,
1715 struct_ops: Option<libbpf_rs::Link>,
1716 layer_specs: Vec<LayerSpec>,
1717
1718 sched_intv: Duration,
1719 layer_refresh_intv: Duration,
1720
1721 cpu_pool: CpuPool,
1722 layers: Vec<Layer>,
1723 idle_qos_enabled: bool,
1724
1725 proc_reader: fb_procfs::ProcReader,
1726 sched_stats: Stats,
1727
1728 cgroup_regexes: Option<HashMap<u32, Regex>>,
1729
1730 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1731 processing_dur: Duration,
1732
1733 topo: Arc<Topology>,
1734 netdevs: BTreeMap<String, NetDev>,
1735 stats_server: StatsServer<StatsReq, StatsRes>,
1736 gpu_task_handler: GpuTaskAffinitizer,
1737}
1738
1739impl<'a> Scheduler<'a> {
1740 fn init_layers(
1741 skel: &mut OpenBpfSkel,
1742 specs: &[LayerSpec],
1743 topo: &Topology,
1744 ) -> Result<HashMap<u32, Regex>> {
1745 skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1746 let mut perf_set = false;
1747
1748 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1749 let mut layer_weights: Vec<usize> = vec![];
1750 let mut cgroup_regex_id = 0;
1751 let mut cgroup_regexes = HashMap::new();
1752
1753 for (spec_i, spec) in specs.iter().enumerate() {
1754 let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1755
1756 for (or_i, or) in spec.matches.iter().enumerate() {
1757 for (and_i, and) in or.iter().enumerate() {
1758 let mt = &mut layer.matches[or_i].matches[and_i];
1759
1760 mt.exclude.write(false);
1762
1763 match and {
1764 LayerMatch::CgroupPrefix(prefix) => {
1765 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1766 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1767 }
1768 LayerMatch::CgroupSuffix(suffix) => {
1769 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1770 copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1771 }
1772 LayerMatch::CgroupRegex(regex_str) => {
1773 if cgroup_regex_id >= bpf_intf::consts_MAX_CGROUP_REGEXES {
1774 bail!(
1775 "Too many cgroup regex rules. Maximum allowed: {}",
1776 bpf_intf::consts_MAX_CGROUP_REGEXES
1777 );
1778 }
1779
1780 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_REGEX as i32;
1782 mt.cgroup_regex_id = cgroup_regex_id;
1783
1784 let regex = Regex::new(regex_str).with_context(|| {
1785 format!("Invalid regex '{}' in layer '{}'", regex_str, spec.name)
1786 })?;
1787 cgroup_regexes.insert(cgroup_regex_id, regex);
1788 cgroup_regex_id += 1;
1789 }
1790 LayerMatch::CgroupContains(substr) => {
1791 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1792 copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1793 }
1794 LayerMatch::CommPrefix(prefix) => {
1795 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1796 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1797 }
1798 LayerMatch::CommPrefixExclude(prefix) => {
1799 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1800 mt.exclude.write(true);
1801 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1802 }
1803 LayerMatch::PcommPrefix(prefix) => {
1804 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1805 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1806 }
1807 LayerMatch::PcommPrefixExclude(prefix) => {
1808 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1809 mt.exclude.write(true);
1810 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1811 }
1812 LayerMatch::NiceAbove(nice) => {
1813 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1814 mt.nice = *nice;
1815 }
1816 LayerMatch::NiceBelow(nice) => {
1817 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1818 mt.nice = *nice;
1819 }
1820 LayerMatch::NiceEquals(nice) => {
1821 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1822 mt.nice = *nice;
1823 }
1824 LayerMatch::UIDEquals(user_id) => {
1825 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1826 mt.user_id = *user_id;
1827 }
1828 LayerMatch::GIDEquals(group_id) => {
1829 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1830 mt.group_id = *group_id;
1831 }
1832 LayerMatch::PIDEquals(pid) => {
1833 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1834 mt.pid = *pid;
1835 }
1836 LayerMatch::PPIDEquals(ppid) => {
1837 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1838 mt.ppid = *ppid;
1839 }
1840 LayerMatch::TGIDEquals(tgid) => {
1841 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1842 mt.tgid = *tgid;
1843 }
1844 LayerMatch::NSPIDEquals(nsid, pid) => {
1845 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1846 mt.nsid = *nsid;
1847 mt.pid = *pid;
1848 }
1849 LayerMatch::NSEquals(nsid) => {
1850 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1851 mt.nsid = *nsid as u64;
1852 }
1853 LayerMatch::CmdJoin(joincmd) => {
1854 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1855 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1856 }
1857 LayerMatch::IsGroupLeader(polarity) => {
1858 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1859 mt.is_group_leader.write(*polarity);
1860 }
1861 LayerMatch::IsKthread(polarity) => {
1862 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1863 mt.is_kthread.write(*polarity);
1864 }
1865 LayerMatch::UsedGpuTid(polarity) => {
1866 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1867 mt.used_gpu_tid.write(*polarity);
1868 }
1869 LayerMatch::UsedGpuPid(polarity) => {
1870 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1871 mt.used_gpu_pid.write(*polarity);
1872 }
1873 LayerMatch::AvgRuntime(min, max) => {
1874 mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1875 mt.min_avg_runtime_us = *min;
1876 mt.max_avg_runtime_us = *max;
1877 }
1878 LayerMatch::HintEquals(hint) => {
1879 mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1880 mt.hint = *hint;
1881 }
1882 LayerMatch::SystemCpuUtilBelow(threshold) => {
1883 mt.kind = bpf_intf::layer_match_kind_MATCH_SYSTEM_CPU_UTIL_BELOW as i32;
1884 mt.system_cpu_util_below = (*threshold * 10000.0) as u64;
1885 }
1886 LayerMatch::DsqInsertBelow(threshold) => {
1887 mt.kind = bpf_intf::layer_match_kind_MATCH_DSQ_INSERT_BELOW as i32;
1888 mt.dsq_insert_below = (*threshold * 10000.0) as u64;
1889 }
1890 LayerMatch::NumaNode(node_id) => {
1891 if *node_id as usize >= topo.nodes.len() {
1892 bail!(
1893 "Spec {:?} has invalid NUMA node ID {} (available nodes: 0-{})",
1894 spec.name,
1895 node_id,
1896 topo.nodes.len() - 1
1897 );
1898 }
1899 mt.kind = bpf_intf::layer_match_kind_MATCH_NUMA_NODE as i32;
1900 mt.numa_node_id = *node_id;
1901 }
1902 }
1903 }
1904 layer.matches[or_i].nr_match_ands = or.len() as i32;
1905 }
1906
1907 layer.nr_match_ors = spec.matches.len() as u32;
1908 layer.kind = spec.kind.as_bpf_enum();
1909
1910 {
1911 let LayerCommon {
1912 min_exec_us,
1913 yield_ignore,
1914 perf,
1915 preempt,
1916 preempt_first,
1917 exclusive,
1918 allow_node_aligned,
1919 skip_remote_node,
1920 prev_over_idle_core,
1921 growth_algo,
1922 nodes,
1923 slice_us,
1924 fifo,
1925 weight,
1926 disallow_open_after_us,
1927 disallow_preempt_after_us,
1928 xllc_mig_min_us,
1929 placement,
1930 member_expire_ms,
1931 ..
1932 } = spec.kind.common();
1933
1934 layer.slice_ns = *slice_us * 1000;
1935 layer.fifo.write(*fifo);
1936 layer.min_exec_ns = min_exec_us * 1000;
1937 layer.yield_step_ns = if *yield_ignore > 0.999 {
1938 0
1939 } else if *yield_ignore < 0.001 {
1940 layer.slice_ns
1941 } else {
1942 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1943 };
1944 let mut layer_name: String = spec.name.clone();
1945 layer_name.truncate(MAX_LAYER_NAME);
1946 copy_into_cstr(&mut layer.name, layer_name.as_str());
1947 layer.preempt.write(*preempt);
1948 layer.preempt_first.write(*preempt_first);
1949 layer.excl.write(*exclusive);
1950 layer.allow_node_aligned.write(*allow_node_aligned);
1951 layer.skip_remote_node.write(*skip_remote_node);
1952 layer.prev_over_idle_core.write(*prev_over_idle_core);
1953 layer.growth_algo = growth_algo.as_bpf_enum();
1954 layer.weight = *weight;
1955 layer.member_expire_ms = *member_expire_ms;
1956 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1957 v if v == u64::MAX => v,
1958 v => v * 1000,
1959 };
1960 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1961 v if v == u64::MAX => v,
1962 v => v * 1000,
1963 };
1964 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1965 layer_weights.push(layer.weight.try_into().unwrap());
1966 layer.perf = u32::try_from(*perf)?;
1967 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1968 for (topo_node_id, topo_node) in &topo.nodes {
1969 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1970 continue;
1971 }
1972 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1973 }
1974
1975 let task_place = |place: u32| crate::types::layer_task_place(place);
1976 layer.task_place = match placement {
1977 LayerPlacement::Standard => {
1978 task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1979 }
1980 LayerPlacement::Sticky => {
1981 task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1982 }
1983 LayerPlacement::Floating => {
1984 task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1985 }
1986 };
1987 }
1988
1989 layer.is_protected.write(match spec.kind {
1990 LayerKind::Open { .. } => false,
1991 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1992 protected
1993 }
1994 });
1995
1996 match &spec.cpuset {
1997 Some(mask) => {
1998 Self::update_cpumask(&mask, &mut layer.cpuset);
1999 }
2000 None => {
2001 for i in 0..layer.cpuset.len() {
2002 layer.cpuset[i] = u8::MAX;
2003 }
2004 }
2005 };
2006
2007 perf_set |= layer.perf > 0;
2008 }
2009
2010 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
2011 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
2012 skel.maps
2013 .rodata_data
2014 .as_mut()
2015 .unwrap()
2016 .layer_iteration_order[idx] = *layer_idx as u32;
2017 }
2018
2019 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
2020 warn!("cpufreq support not available, ignoring perf configurations");
2021 }
2022
2023 Ok(cgroup_regexes)
2024 }
2025
2026 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
2027 skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
2028 skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
2029
2030 for (&node_id, node) in &topo.nodes {
2031 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
2032 skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
2033 let raw_numa_slice = node.span.as_raw_slice();
2034 let node_cpumask_slice =
2035 &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
2036 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
2037 left.clone_from_slice(raw_numa_slice);
2038 debug!(
2039 "node {} mask: {:?}",
2040 node_id,
2041 skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
2042 );
2043
2044 for llc in node.llcs.values() {
2045 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
2046 skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
2047 }
2048 }
2049
2050 for cpu in topo.all_cpus.values() {
2051 skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
2052 }
2053 }
2054
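    // Build each CPU's proximity map: the CPU itself, then its SMT siblings, same-LLC CPUs,
    // same-node CPUs, and finally the rest of the system, with each tier sorted to radiate
    // outward from the CPU.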
2055 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
2056 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
2057 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
2058 vec
2059 };
2060 let radiate_cpu =
2061 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
2062 vec.sort_by_key(|&id| {
2063 (
2064 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
2065 (center_cpu as i32 - id as i32).abs(),
2066 )
2067 });
2068 vec
2069 };
2070
2071 for (&cpu_id, cpu) in &topo.all_cpus {
2072 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
2074 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
2075 let node_span = &topo.nodes[&cpu.node_id].span;
2076 let sys_span = &topo.span;
2077
2078 let sys_span = sys_span.and(&node_span.not());
2080 let node_span = node_span.and(&llc_span.not());
2081 let llc_span = llc_span.and(&core_span.not());
2082 core_span.clear_cpu(cpu_id).unwrap();
2083
2084 let mut sys_order: Vec<usize> = sys_span.iter().collect();
2086 let mut node_order: Vec<usize> = node_span.iter().collect();
2087 let mut llc_order: Vec<usize> = llc_span.iter().collect();
2088 let mut core_order: Vec<usize> = core_span.iter().collect();
2089
2090 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
2095 node_order = radiate(node_order, cpu.node_id);
2096 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
2097 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
2098
2099 let mut order: Vec<usize> = vec![];
2101 let mut idx: usize = 0;
2102
2103 idx += 1;
2104 order.push(cpu_id);
2105
2106 idx += core_order.len();
2107 order.append(&mut core_order);
2108 let core_end = idx;
2109
2110 idx += llc_order.len();
2111 order.append(&mut llc_order);
2112 let llc_end = idx;
2113
2114 idx += node_order.len();
2115 order.append(&mut node_order);
2116 let node_end = idx;
2117
2118 idx += sys_order.len();
2119 order.append(&mut sys_order);
2120 let sys_end = idx;
2121
2122 debug!(
2123 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
2124 cpu_id, core_end, llc_end, node_end, sys_end, &order
2125 );
2126
2127 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
2129 for (i, &cpu) in order.iter().enumerate() {
2130 pmap.cpus[i] = cpu as u16;
2131 }
2132 pmap.core_end = core_end as u32;
2133 pmap.llc_end = llc_end as u32;
2134 pmap.node_end = node_end as u32;
2135 pmap.sys_end = sys_end as u32;
2136 }
2137 }
2138
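    // Serialize cpu_ctx structs into raw byte vectors for the per-CPU map update.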
2139 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
2140 cpu_ctxs
2141 .into_iter()
2142 .map(|cpu_ctx| {
2143 let bytes = unsafe {
2144 std::slice::from_raw_parts(
2145 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
2146 std::mem::size_of::<bpf_intf::cpu_ctx>(),
2147 )
2148 };
2149 bytes.to_vec()
2150 })
2151 .collect()
2152 }
2153
2154 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
2155 let key = (0_u32).to_ne_bytes();
2156 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
2157 let cpu_ctxs_vec = skel
2158 .maps
2159 .cpu_ctxs
2160 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
2161 .context("Failed to lookup cpu_ctx")?
2162 .unwrap();
2163
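        // Partition layer indices by kind and preemption: open-preempting (op), open
        // non-preempting (on), grouped-preempting (gp), and grouped non-preempting (gn).
        // Each CPU later gets its own randomized iteration order over these groups.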
2164 let op_layers: Vec<u32> = layer_specs
2165 .iter()
2166 .enumerate()
2167 .filter(|(_idx, spec)| match &spec.kind {
2168 LayerKind::Open { .. } => spec.kind.common().preempt,
2169 _ => false,
2170 })
2171 .map(|(idx, _)| idx as u32)
2172 .collect();
2173 let on_layers: Vec<u32> = layer_specs
2174 .iter()
2175 .enumerate()
2176 .filter(|(_idx, spec)| match &spec.kind {
2177 LayerKind::Open { .. } => !spec.kind.common().preempt,
2178 _ => false,
2179 })
2180 .map(|(idx, _)| idx as u32)
2181 .collect();
2182 let gp_layers: Vec<u32> = layer_specs
2183 .iter()
2184 .enumerate()
2185 .filter(|(_idx, spec)| match &spec.kind {
2186 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2187 _ => false,
2188 })
2189 .map(|(idx, _)| idx as u32)
2190 .collect();
2191 let gn_layers: Vec<u32> = layer_specs
2192 .iter()
2193 .enumerate()
2194 .filter(|(_idx, spec)| match &spec.kind {
2195 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2196 _ => false,
2197 })
2198 .map(|(idx, _)| idx as u32)
2199 .collect();
2200
2201 for cpu in 0..*NR_CPUS_POSSIBLE {
2203 cpu_ctxs.push(*unsafe {
2204 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2205 });
2206
2207 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2208 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2209 cpu_ctxs[cpu].cpu = cpu as i32;
2210 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2211 cpu_ctxs[cpu].is_big = is_big;
2212
2213 fastrand::seed(cpu as u64);
2214
2215 let mut ogp_order = op_layers.clone();
2216 ogp_order.append(&mut gp_layers.clone());
2217 fastrand::shuffle(&mut ogp_order);
2218
2219 let mut ogn_order = on_layers.clone();
2220 ogn_order.append(&mut gn_layers.clone());
2221 fastrand::shuffle(&mut ogn_order);
2222
2223 let mut op_order = op_layers.clone();
2224 fastrand::shuffle(&mut op_order);
2225
2226 let mut on_order = on_layers.clone();
2227 fastrand::shuffle(&mut on_order);
2228
2229 let mut gp_order = gp_layers.clone();
2230 fastrand::shuffle(&mut gp_order);
2231
2232 let mut gn_order = gn_layers.clone();
2233 fastrand::shuffle(&mut gn_order);
2234
2235 for i in 0..MAX_LAYERS {
2236 cpu_ctxs[cpu].ogp_layer_order[i] =
2237 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2238 cpu_ctxs[cpu].ogn_layer_order[i] =
2239 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2240
2241 cpu_ctxs[cpu].op_layer_order[i] =
2242 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2243 cpu_ctxs[cpu].on_layer_order[i] =
2244 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2245 cpu_ctxs[cpu].gp_layer_order[i] =
2246 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2247 cpu_ctxs[cpu].gn_layer_order[i] =
2248 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2249 }
2250 }
2251
2252 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2253
2254 skel.maps
2255 .cpu_ctxs
2256 .update_percpu(
2257 &key,
2258 &Self::convert_cpu_ctxs(cpu_ctxs),
2259 libbpf_rs::MapFlags::ANY,
2260 )
2261 .context("Failed to update cpu_ctx")?;
2262
2263 Ok(())
2264 }
2265
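    // Build each LLC's proximity map: the LLC itself, then same-node LLCs, then remote
    // LLCs, with the latter two tiers shuffled per LLC.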
2266 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2267 for (&llc_id, llc) in &topo.all_llcs {
2268 let mut node_order: Vec<usize> =
2270 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2271 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2272
2273 sys_order.retain(|id| !node_order.contains(id));
2275 node_order.retain(|&id| id != llc_id);
2276
2277 fastrand::seed(llc_id as u64);
2280 fastrand::shuffle(&mut sys_order);
2281 fastrand::shuffle(&mut node_order);
2282
2283 let mut order: Vec<usize> = vec![];
2285 let mut idx: usize = 0;
2286
2287 idx += 1;
2288 order.push(llc_id);
2289
2290 idx += node_order.len();
2291 order.append(&mut node_order);
2292 let node_end = idx;
2293
2294 idx += sys_order.len();
2295 order.append(&mut sys_order);
2296 let sys_end = idx;
2297
2298 debug!(
2299 "LLC[{}] proximity map[{}/{}]: {:?}",
2300 llc_id, node_end, sys_end, &order
2301 );
2302
2303 let key = llc_id as u32;
2308 let llc_id_slice =
2309 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2310 let v = skel
2311 .maps
2312 .llc_data
2313 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2314 .unwrap()
2315 .unwrap();
2316 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2317
2318 let pmap = &mut llcc.prox_map;
2319 for (i, &llc_id) in order.iter().enumerate() {
2320 pmap.llcs[i] = llc_id as u16;
2321 }
2322 pmap.node_end = node_end as u32;
2323 pmap.sys_end = sys_end as u32;
2324
2325 let v = unsafe {
2326 std::slice::from_raw_parts(
2327 &llcc as *const bpf_intf::llc_ctx as *const u8,
2328 std::mem::size_of::<bpf_intf::llc_ctx>(),
2329 )
2330 };
2331
2332 skel.maps
2333 .llc_data
2334 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2335 }
2336
2337 Ok(())
2338 }
2339
2340 fn init(
2341 opts: &'a Opts,
2342 layer_specs: &[LayerSpec],
2343 open_object: &'a mut MaybeUninit<OpenObject>,
2344 hint_to_layer_map: &HashMap<u64, HintLayerInfo>,
2345 membw_tracking: bool,
2346 ) -> Result<Self> {
2347 let nr_layers = layer_specs.len();
2348 let mut disable_topology = opts.disable_topology.unwrap_or(false);
2349
2350 let topo = Arc::new(if disable_topology {
2351 Topology::with_flattened_llc_node()?
2352 } else if opts.topology.virt_llc.is_some() {
2353 Topology::with_args(&opts.topology)?
2354 } else {
2355 Topology::new()?
2356 });
2357
2358 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2365 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2366 }
2367 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2368 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2369 }
2370 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2371 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2372 }
2373
2374 let netdevs = if opts.netdev_irq_balance {
2375 warn!(
2376 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2377 );
2378 read_netdevs()?
2379 } else {
2380 BTreeMap::new()
2381 };
2382
        if !disable_topology {
            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
                disable_topology = true;
            }
            info!(
                "Topology awareness not specified, selecting {} based on hardware",
                if disable_topology {
                    "disabled"
                } else {
                    "enabled"
                }
            );
        }
2396
2397 let cpu_pool = CpuPool::new(topo.clone())?;
2398
2399 let layer_specs: Vec<_> = if disable_topology {
2402 info!("Disabling topology awareness");
2403 layer_specs
2404 .iter()
2405 .cloned()
2406 .map(|mut s| {
2407 s.kind.common_mut().nodes.clear();
2408 s.kind.common_mut().llcs.clear();
2409 s
2410 })
2411 .collect()
2412 } else {
2413 layer_specs.to_vec()
2414 };
2415
2416 init_libbpf_logging(None);
2418 let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2419 if !kfuncs_in_syscall {
2420 warn!("Using slow path: kfuncs not supported in syscall programs (a8e03b6bbb2c ∉ ker)");
2421 }
2422
2423 let debug_level = if opts.log_level.contains("trace") {
2425 2
2426 } else if opts.log_level.contains("debug") {
2427 1
2428 } else {
2429 0
2430 };
2431 let mut skel_builder = BpfSkelBuilder::default();
2432 skel_builder.obj_builder.debug(debug_level > 1);
2433
2434 info!(
2435 "Running scx_layered (build ID: {})",
2436 build_id::full_version(env!("CARGO_PKG_VERSION"))
2437 );
2438 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2439 let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2440
2441 skel.progs.scx_pmu_switch_tc.set_autoload(membw_tracking);
2443 skel.progs.scx_pmu_tick_tc.set_autoload(membw_tracking);
2444
2445 if opts.enable_gpu_support {
2448 if opts.gpu_kprobe_level >= 1 {
2451 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2452 }
2453 if opts.gpu_kprobe_level >= 2 {
2456 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2457 }
2458 if opts.gpu_kprobe_level >= 3 {
2459 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2460 }
2461 }
2462
2463 let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2464 let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2465
2466 let event = if membw_tracking {
2467 setup_membw_tracking(&mut skel)?
2468 } else {
2469 0
2470 };
2471
2472 let rodata = skel.maps.rodata_data.as_mut().unwrap();
2473
        if let (Ok(ext_addr), Ok(idle_addr)) = (ext_sched_class_addr, idle_sched_class_addr) {
            rodata.ext_sched_class_addr = ext_addr;
            rodata.idle_sched_class_addr = idle_addr;
        } else {
2478 warn!(
2479 "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2480 );
2481 }
2482
2483 rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2484 rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2485
2486 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2488
2489 if !opts.disable_queued_wakeup {
2490 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2491 0 => info!("Kernel does not support queued wakeup optimization"),
2492 v => skel.struct_ops.layered_mut().flags |= v,
2493 }
2494 }
2495
2496 rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2497 rodata.percpu_kthread_preempt_all =
2498 !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2499 rodata.debug = debug_level as u32;
2500 rodata.slice_ns = opts.slice_us * 1000;
2501 rodata.max_exec_ns = if opts.max_exec_us > 0 {
2502 opts.max_exec_us * 1000
2503 } else {
2504 opts.slice_us * 1000 * 20
2505 };
2506 rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2507 rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2508 rodata.smt_enabled = topo.smt_enabled;
2509 rodata.has_little_cores = topo.has_little_cores();
2510 rodata.xnuma_preemption = opts.xnuma_preemption;
2511 rodata.antistall_sec = opts.antistall_sec;
2512 rodata.monitor_disable = opts.monitor_disable;
2513 rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2514 rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2515 rodata.enable_antistall = !opts.disable_antistall;
2516 rodata.enable_match_debug = opts.enable_match_debug;
2517 rodata.enable_gpu_support = opts.enable_gpu_support;
2518 rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2519
2520 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2521 rodata.__sibling_cpu[cpu] = *sib;
2522 }
2523 for cpu in topo.all_cpus.keys() {
2524 rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2525 }
2526
2527 rodata.nr_op_layers = layer_specs
2528 .iter()
2529 .filter(|spec| match &spec.kind {
2530 LayerKind::Open { .. } => spec.kind.common().preempt,
2531 _ => false,
2532 })
2533 .count() as u32;
2534 rodata.nr_on_layers = layer_specs
2535 .iter()
2536 .filter(|spec| match &spec.kind {
2537 LayerKind::Open { .. } => !spec.kind.common().preempt,
2538 _ => false,
2539 })
2540 .count() as u32;
2541 rodata.nr_gp_layers = layer_specs
2542 .iter()
2543 .filter(|spec| match &spec.kind {
2544 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2545 _ => false,
2546 })
2547 .count() as u32;
2548 rodata.nr_gn_layers = layer_specs
2549 .iter()
2550 .filter(|spec| match &spec.kind {
2551 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2552 _ => false,
2553 })
2554 .count() as u32;
2555 rodata.nr_excl_layers = layer_specs
2556 .iter()
2557 .filter(|spec| spec.kind.common().exclusive)
2558 .count() as u32;
2559
2560 let mut min_open = u64::MAX;
2561 let mut min_preempt = u64::MAX;
2562
2563 for spec in layer_specs.iter() {
2564 if let LayerKind::Open { common, .. } = &spec.kind {
2565 min_open = min_open.min(common.disallow_open_after_us.unwrap());
2566 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2567 }
2568 }
2569
2570 rodata.min_open_layer_disallow_open_after_ns = match min_open {
2571 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2572 v => v,
2573 };
2574 rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2575 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2576 v => v,
2577 };
2578
2579 for i in 0..layer_specs.len() {
2581 skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2582 }
2583 skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2584
2585 let layered_task_hint_map_path = &opts.task_hint_map;
2590 let hint_map = &mut skel.maps.scx_layered_task_hint_map;
        if !layered_task_hint_map_path.is_empty() {
            hint_map
                .set_pin_path(layered_task_hint_map_path)
                .context("Failed to set pin path for task hint map")?;
2594 rodata.task_hint_map_enabled = true;
2595 }
2596
2597 if !opts.hi_fb_thread_name.is_empty() {
2598 let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2599 copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2600 rodata.enable_hi_fb_thread_name_match = true;
2601 }
2602
2603 let cgroup_regexes = Self::init_layers(&mut skel, &layer_specs, &topo)?;
2604 skel.maps.rodata_data.as_mut().unwrap().nr_cgroup_regexes = cgroup_regexes.len() as u32;
2605 Self::init_nodes(&mut skel, opts, &topo);
2606
2607 let mut skel = scx_ops_load!(skel, layered, uei)?;
2608
        if !hint_to_layer_map.is_empty() {
2611 for (k, v) in hint_to_layer_map.iter() {
2612 let key: u32 = *k as u32;
2613
2614 let mut info_bytes = vec![0u8; std::mem::size_of::<bpf_intf::hint_layer_info>()];
2616 let info_ptr = info_bytes.as_mut_ptr() as *mut bpf_intf::hint_layer_info;
2617 unsafe {
2618 (*info_ptr).layer_id = v.layer_id as u32;
                    (*info_ptr).system_cpu_util_below = match v.system_cpu_util_below {
                        Some(threshold) => (threshold * 10000.0) as u64,
                        None => u64::MAX,
                    };
                    (*info_ptr).dsq_insert_below = match v.dsq_insert_below {
                        Some(threshold) => (threshold * 10000.0) as u64,
                        None => u64::MAX,
                    };
2627 }
2628
2629 skel.maps.hint_to_layer_id_map.update(
2630 &key.to_ne_bytes(),
2631 &info_bytes,
2632 libbpf_rs::MapFlags::ANY,
2633 )?;
2634 }
2635 }
2636
2637 if membw_tracking {
2638 create_perf_fds(&mut skel, event)?;
2639 }
2640
2641 let mut layers = vec![];
2642 let layer_growth_orders =
2643 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2644 for (idx, spec) in layer_specs.iter().enumerate() {
2645 let growth_order = layer_growth_orders
2646 .get(&idx)
2647 .with_context(|| "layer has no growth order".to_string())?;
2648 layers.push(Layer::new(spec, &topo, growth_order)?);
2649 }
2650
2651 let mut idle_qos_enabled = layers
2652 .iter()
2653 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2654 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2655 warn!("idle_resume_us not supported, ignoring");
2656 idle_qos_enabled = false;
2657 }
2658
2659 Self::init_cpus(&skel, &layer_specs, &topo)?;
2660 Self::init_llc_prox_map(&mut skel, &topo)?;
2661
2662 let proc_reader = fb_procfs::ProcReader::new();
2664
2665 let input = ProgramInput {
2667 ..Default::default()
2668 };
2669 let prog = &mut skel.progs.initialize_pid_namespace;
2670
2671 let _ = prog.test_run(input);
2672
        if !layered_task_hint_map_path.is_empty() {
2682 let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2683 let mode: libc::mode_t = 0o666;
2684 unsafe {
2685 if libc::chmod(path.as_ptr(), mode) != 0 {
2686 trace!("'chmod' to 666 of task hint map failed, continuing...");
2687 }
2688 }
2689 }
2690
2691 let struct_ops = scx_ops_attach!(skel, layered)?;
2693 let stats_server = StatsServer::new(stats::server_data()).launch()?;
2694 let mut gpu_task_handler =
2695 GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2696 gpu_task_handler.init(topo.clone());
2697
2698 let sched = Self {
2699 struct_ops: Some(struct_ops),
2700 layer_specs,
2701
2702 sched_intv: Duration::from_secs_f64(opts.interval),
2703 layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2704
2705 cpu_pool,
2706 layers,
2707 idle_qos_enabled,
2708
2709 sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2710
2711 cgroup_regexes: Some(cgroup_regexes),
2712 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2713 processing_dur: Default::default(),
2714
2715 proc_reader,
2716 skel,
2717
2718 topo,
2719 netdevs,
2720 stats_server,
2721 gpu_task_handler,
2722 };
2723
2724 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2725
2726 Ok(sched)
2727 }
2728
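    // Serialize a Cpumask into the byte array shared with BPF: bit (cpu % 8) of
    // byte (cpu / 8) is set iff the CPU is in the mask.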
2729 fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2730 for cpu in 0..mask.len() {
2731 if mask.test_cpu(cpu) {
2732 bpfmask[cpu / 8] |= 1 << (cpu % 8);
2733 } else {
2734 bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2735 }
2736 }
2737 }
2738
2739 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2740 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2741 Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2742
2743 bpf_layer.nr_cpus = layer.nr_cpus as u32;
2744 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2745 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2746 }
2747
2748 bpf_layer.refresh_cpus = 1;
2749 }
2750
2751 fn update_netdev_cpumasks(&mut self) -> Result<()> {
2752 let available_cpus = self.cpu_pool.available_cpus();
2753 if available_cpus.is_empty() {
2754 return Ok(());
2755 }
2756
2757 for (iface, netdev) in self.netdevs.iter_mut() {
            let node = self
                .topo
                .nodes
                .values()
                .find(|n| n.id == netdev.node())
                .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2765 let node_cpus = node.span.clone();
2766 for (irq, irqmask) in netdev.irqs.iter_mut() {
2767 irqmask.clear_all();
2768 for cpu in available_cpus.iter() {
2769 if !node_cpus.test_cpu(cpu) {
2770 continue;
2771 }
2772 let _ = irqmask.set_cpu(cpu);
2773 }
2774 if irqmask.weight() == 0 {
2776 for cpu in node_cpus.iter() {
2777 let _ = irqmask.set_cpu(cpu);
2778 }
2779 }
2780 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2781 }
2782 netdev.apply_cpumasks()?;
2783 }
2784
2785 Ok(())
2786 }
2787
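    // Convert a layer's memory-bandwidth budget into a cap on its CPU count: the
    // last measured bandwidth divided by the layer's current CPU count gives a
    // per-CPU estimate, and the cap is membw_limit / per-CPU estimate. With
    // illustrative numbers, 8 CPUs moving 16 GiB is 2 GiB per CPU, so a 10 GiB
    // limit caps the layer at 5 CPUs. Returns curtarget unchanged when no limit
    // is configured or no bandwidth has been observed yet.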
2788 fn clamp_target_by_membw(
2789 &self,
2790 layer: &Layer,
2791 membw_limit: f64,
2792 membw: f64,
2793 curtarget: u64,
2794 ) -> usize {
        let ncpu = layer.cpus.weight() as u64;
        let membw = (membw * 1024f64.powi(3)).round() as u64;
        let membw_limit = (membw_limit * 1024f64.powi(3)).round() as u64;
        let last_membw_percpu = if ncpu > 0 { membw / ncpu } else { 0 };

        if membw_limit == 0 || last_membw_percpu == 0 {
            return curtarget as usize;
        }

        (membw_limit / last_membw_percpu) as usize
2807 }
2808
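    // Compute a (target, minimum) CPU count for every layer from its measured
    // utilization and configured util_range: low = ceil(util / util_range.1),
    // high = floor(util / util_range.0). With illustrative numbers, util = 2.4
    // and util_range = (0.5, 0.8) give low = 3 and high = 4 CPUs. The current
    // CPU count is clamped into [low, high], then into cpus_range, and finally
    // capped by the memory-bandwidth limit. Open layers report (0, 0) and are
    // given the leftover CPUs elsewhere.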
2809 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2815 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2816 let utils = &self.sched_stats.layer_utils;
2817 let membws = &self.sched_stats.layer_membws;
2818
2819 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2820 let mut targets: Vec<(usize, usize)> = vec![];
2821
2822 for (idx, layer) in self.layers.iter().enumerate() {
2823 targets.push(match &layer.kind {
2824 LayerKind::Confined {
2825 util_range,
2826 cpus_range,
2827 cpus_range_frac,
2828 membw_gb,
2829 ..
2830 }
2831 | LayerKind::Grouped {
2832 util_range,
2833 cpus_range,
2834 cpus_range_frac,
2835 membw_gb,
2836 ..
2837 } => {
2838 let cpus_range =
2839 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2840
2841 let owned = utils[idx][LAYER_USAGE_OWNED];
2846 let open = utils[idx][LAYER_USAGE_OPEN];
2847
2848 let membw_owned = membws[idx][LAYER_USAGE_OWNED];
2849 let membw_open = membws[idx][LAYER_USAGE_OPEN];
2850
2851 let mut util = owned;
2852 let mut membw = membw_owned;
2853 if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2854 util += open;
2855 membw += membw_open;
2856 }
2857
2858 let util = if util < 0.01 { 0.0 } else { util };
2859 let low = (util / util_range.1).ceil() as usize;
2860 let high = ((util / util_range.0).floor() as usize).max(low);
2861
2862 let membw_limit = match membw_gb {
2863 Some(membw_limit) => *membw_limit,
2864 None => 0.0,
2865 };
2866
                    trace!(
                        "layer {} (membw, membw_limit): ({membw} GiB, {membw_limit} GiB)",
                        layer.name
                    );
2871
2872 let target = layer.cpus.weight().clamp(low, high);
2873
2874 records.push((
2875 (owned * 100.0) as u64,
2876 (open * 100.0) as u64,
2877 (util * 100.0) as u64,
2878 low,
2879 high,
2880 target,
2881 ));
2882
2883 let target = target.clamp(cpus_range.0, cpus_range.1);
2884 let membw_target = self.clamp_target_by_membw(
2885 &layer,
2886 membw_limit as f64,
2887 membw as f64,
2888 target as u64,
2889 );
2890
2891 trace!("CPU target pre- and post-membw adjustment: {target} -> {membw_target}");
2892
2893 if membw_target < cpus_range.0 {
2896 warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2897 warn!("membw_target {membw_target} low {}", cpus_range.0);
2898 };
2899
2900 let target = membw_target.clamp(cpus_range.0, target);
2903
2904 (target, cpus_range.0)
2905 }
2906 LayerKind::Open { .. } => (0, 0),
2907 });
2908 }
2909
2910 trace!("(owned, open, util, low, high, target): {:?}", &records);
2911 targets
2912 }
2913
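    // Distribute the machine's CPUs across layers by weight: layers already at or
    // below their minimum get their target outright, then layers whose target
    // fits within their weighted share are satisfied, and whatever remains is
    // split among the rest in proportion to their weights.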
2914 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2918 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2919 let weights: Vec<usize> = self
2920 .layers
2921 .iter()
2922 .map(|layer| layer.kind.common().weight as usize)
2923 .collect();
2924 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2925 .iter()
2926 .zip(&weights)
2927 .enumerate()
2928 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2929 .collect();
2930 let mut weight_sum: usize = weights.iter().sum();
2931 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2932
2933 trace!("cands: {:?}", &cands);
2934
2935 cands.retain(|&i, &mut (target, min, weight)| {
2937 if target <= min {
2938 let target = target.min(nr_left);
2939 weighted[i] = target;
2940 weight_sum -= weight;
2941 nr_left -= target;
2942 false
2943 } else {
2944 true
2945 }
2946 });
2947
2948 trace!("cands after accepting mins: {:?}", &cands);
2949
2950 let calc_share = |nr_left, weight, weight_sum| {
2952 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2953 };
2954
2955 while !cands.is_empty() {
2956 let mut progress = false;
2957
2958 cands.retain(|&i, &mut (target, _min, weight)| {
2959 let share = calc_share(nr_left, weight, weight_sum);
2960 if target <= share {
2961 weighted[i] = target;
2962 weight_sum -= weight;
2963 nr_left -= target;
2964 progress = true;
2965 false
2966 } else {
2967 true
2968 }
2969 });
2970
2971 if !progress {
2972 break;
2973 }
2974 }
2975
2976 trace!("cands after accepting under allotted: {:?}", &cands);
2977
2978 let nr_to_share = nr_left;
2981 for (i, (_target, _min, weight)) in cands.into_iter() {
2982 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2983 weighted[i] = share;
2984 nr_left -= share;
2985 }
2986
2987 trace!("weighted: {:?}", &weighted);
2988
2989 weighted
2990 }
2991
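    // Translate a CPU target into (full LLCs, extra cores): how many whole LLCs
    // the target fills plus how many cores of one more LLC cover the remainder.
    // With illustrative numbers, 16 CPUs per LLC and 2 CPUs per core, a target
    // of 40 CPUs yields (2, 4).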
2992 fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2996 let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2998 let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
3000 let cpus_per_llc = cores_per_llc * cpus_per_core;
3001
3002 let full = target / cpus_per_llc;
3003 let extra = target % cpus_per_llc;
3004
3005 (full, extra.div_ceil(cpus_per_core))
3006 }
3007
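    // Rebalance whole LLCs between StickyDynamic layers and rebuild their core
    // orders: free LLCs from layers that shrank, hand free LLCs to layers that
    // grew, carve the fractional remainder out of still-free LLCs, then apply the
    // resulting core order to each layer's cpumask. Returns true if any layer's
    // CPU assignment changed.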
    fn recompute_layer_core_order(&mut self, layer_targets: &[(usize, usize)]) -> Result<bool> {
3016 debug!(
3018 " free: before pass: free_llcs={:?}",
3019 self.cpu_pool.free_llcs
3020 );
3021 for &(idx, target) in layer_targets.iter().rev() {
3022 let layer = &mut self.layers[idx];
3023 let old_tlc = layer.target_llc_cpus;
3024 let new_tlc = Self::compute_target_llcs(target, &self.topo);
3025
3026 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3027 continue;
3028 }
3029
3030 let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
3031
3032 debug!(
3033 " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
3034 layer.name,
3035 old_tlc,
3036 new_tlc,
3037 to_free,
3038 layer.assigned_llcs.len(),
3039 self.cpu_pool.free_llcs.len()
3040 );
3041
            while to_free > 0 && !layer.assigned_llcs.is_empty() {
3043 let llc = layer.assigned_llcs.pop().unwrap();
3044 self.cpu_pool.free_llcs.push((llc, 0));
3045 to_free -= 1;
3046
3047 debug!(" layer={} freed_llc={}", layer.name, llc);
3048 }
3049 }
3050 debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
3051
3052 for &(idx, target) in layer_targets.iter().rev() {
3054 let layer = &mut self.layers[idx];
3055 let old_tlc = layer.target_llc_cpus;
3056 let new_tlc = Self::compute_target_llcs(target, &self.topo);
3057
3058 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3059 continue;
3060 }
3061
3062 let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
3063
3064 debug!(
3065 " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
3066 layer.name,
3067 old_tlc,
3068 new_tlc,
3069 to_alloc,
3070 layer.assigned_llcs.len(),
3071 self.cpu_pool.free_llcs.len()
3072 );
3073
            while to_alloc > 0 && to_alloc <= self.cpu_pool.free_llcs.len() {
3078 let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
3079 layer.assigned_llcs.push(llc);
3080 to_alloc -= 1;
3081
3082 debug!(" layer={} alloc_llc={}", layer.name, llc);
3083 }
3084
3085 debug!(
3086 " alloc: layer={} assigned_llcs={:?}",
3087 layer.name, layer.assigned_llcs
3088 );
3089
3090 layer.target_llc_cpus = new_tlc;
3092 }
3093
3094 for &(idx, _) in layer_targets.iter() {
3097 let mut core_order = vec![];
3098 let layer = &mut self.layers[idx];
3099
3100 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3101 continue;
3102 }
3103
3104 let tlc = layer.target_llc_cpus;
3105 let mut extra = tlc.1;
3106 let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
3108 let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
3109 let cpus_per_llc = cores_per_llc * cpus_per_core;
3110
3111 for i in 0..self.cpu_pool.free_llcs.len() {
3113 let free_vec = &mut self.cpu_pool.free_llcs;
3114 let avail = cpus_per_llc - free_vec[i].1;
3116 let mut used = extra.min(avail);
3118 let cores_to_add = used;
3119
3120 let shift = free_vec[i].1;
3121 free_vec[i].1 += used;
3122
3123 let llc_id = free_vec[i].0;
3124 let llc = self.topo.all_llcs.get(&llc_id).unwrap();
3125
3126 for core in llc.cores.iter().skip(shift) {
3127 if used == 0 {
3128 break;
3129 }
3130 core_order.push(core.1.id);
3131 used -= 1;
3132 }
3133
3134 extra -= cores_to_add;
3135 if extra == 0 {
3136 break;
3137 }
3138 }
3139
3140 core_order.reverse();
3141 layer.core_order = core_order;
3142 }
3143
3144 for i in 0..self.cpu_pool.free_llcs.len() {
3146 self.cpu_pool.free_llcs[i].1 = 0;
3147 }
3148
3149 for &(idx, _) in layer_targets.iter() {
3150 let layer = &mut self.layers[idx];
3151
3152 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3153 continue;
3154 }
3155
3156 for core in self.topo.all_cores.iter() {
3157 let llc_id = core.1.llc_id;
3158 if layer.assigned_llcs.contains(&llc_id) {
3159 layer.core_order.push(core.1.id);
3160 }
3161 }
3162 layer.core_order.reverse();
3164
3165 debug!(
3166 " alloc: layer={} core_order={:?}",
3167 layer.name, layer.core_order
3168 );
3169 }
3170
3171 let mut updated = false;
3175
3176 for &(idx, _) in layer_targets.iter() {
3178 let layer = &mut self.layers[idx];
3179
3180 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3181 continue;
3182 }
3183
3184 let mut new_cpus = Cpumask::new();
3186 for &core_id in &layer.core_order {
3187 if let Some(core) = self.topo.all_cores.get(&core_id) {
3188 new_cpus |= &core.span;
3189 }
3190 }
3191
3192 new_cpus &= &layer.allowed_cpus;
3194
3195 let cpus_to_free = layer.cpus.clone().and(&new_cpus.clone().not());
3197
3198 if cpus_to_free.weight() > 0 {
3199 debug!(
3200 " apply: layer={} freeing CPUs: {}",
3201 layer.name, cpus_to_free
3202 );
3203 layer.cpus &= &cpus_to_free.not();
3205 layer.nr_cpus -= cpus_to_free.weight();
3206 for cpu in cpus_to_free.iter() {
3207 layer.nr_llc_cpus[self.cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
3208 }
3209 self.cpu_pool.free(&cpus_to_free)?;
3210 updated = true;
3211 }
3212 }
3213
3214 for &(idx, _) in layer_targets.iter() {
3216 let layer = &mut self.layers[idx];
3217
3218 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
3219 continue;
3220 }
3221
3222 let mut new_cpus = Cpumask::new();
3224 for &core_id in &layer.core_order {
3225 if let Some(core) = self.topo.all_cores.get(&core_id) {
3226 new_cpus |= &core.span;
3227 }
3228 }
3229 new_cpus &= &layer.allowed_cpus;
3230
3231 let available_cpus = self.cpu_pool.available_cpus();
3233 let desired_to_alloc = new_cpus.clone().and(&layer.cpus.clone().not());
3234 let cpus_to_alloc = desired_to_alloc.clone().and(&available_cpus);
3235
3236 if desired_to_alloc.weight() > cpus_to_alloc.weight() {
3237 debug!(
3238 " apply: layer={} wanted to alloc {} CPUs but only {} available",
3239 layer.name,
3240 desired_to_alloc.weight(),
3241 cpus_to_alloc.weight()
3242 );
3243 }
3244
3245 if cpus_to_alloc.weight() > 0 {
3246 debug!(
3247 " apply: layer={} allocating CPUs: {}",
3248 layer.name, cpus_to_alloc
3249 );
3250 layer.cpus |= &cpus_to_alloc;
3252 layer.nr_cpus += cpus_to_alloc.weight();
3253 for cpu in cpus_to_alloc.iter() {
3254 layer.nr_llc_cpus[self.cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
3255 }
3256 self.cpu_pool.mark_allocated(&cpus_to_alloc)?;
3257 updated = true;
3258 }
3259
3260 debug!(
3261 " apply: layer={} final cpus.weight()={} nr_cpus={}",
3262 layer.name,
3263 layer.cpus.weight(),
3264 layer.nr_cpus
3265 );
3266 }
3267
3268 Ok(updated)
3269 }
3270
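    // Periodic CPU reassignment: compute per-layer targets, shrink layers that
    // are over target, grow layers that are under target, give the remaining
    // unallocated CPUs to open layers, and push the updated cpumasks and
    // empty-layer list down to BPF.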
3271 fn refresh_cpumasks(&mut self) -> Result<()> {
3272 let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
3273
3274 let mut updated = false;
3275 let targets = self.calc_target_nr_cpus();
3276 let targets = self.weighted_target_nr_cpus(&targets);
3277
3278 let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
3279 ascending.sort_by(|a, b| a.1.cmp(&b.1));
3280
3281 let sticky_dynamic_updated = self.recompute_layer_core_order(&ascending)?;
3282 updated |= sticky_dynamic_updated;
3283
3284 if sticky_dynamic_updated {
3286 for (idx, layer) in self.layers.iter().enumerate() {
3287 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3288 Self::update_bpf_layer_cpumask(
3289 layer,
3290 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3291 );
3292 }
3293 }
3294 }
3295
3296 let mut force_free = self
3299 .layers
3300 .iter()
3301 .zip(targets.iter())
3302 .any(|(layer, &target)| layer.nr_cpus < target);
3303
3304 for &(idx, target) in ascending.iter().rev() {
3308 let layer = &mut self.layers[idx];
3309 if layer_is_open(layer) {
3310 continue;
3311 }
3312
3313 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3315 continue;
3316 }
3317
3318 let nr_cur = layer.cpus.weight();
3319 if nr_cur <= target {
3320 continue;
3321 }
3322 let mut nr_to_free = nr_cur - target;
3323
3324 let nr_to_break_at = nr_to_free / 2;
3329
3330 let mut freed = false;
3331
3332 while nr_to_free > 0 {
3333 let max_to_free = if force_free {
3334 force_free = false;
3335 layer.nr_cpus
3336 } else {
3337 nr_to_free
3338 };
3339
3340 let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
3341 if nr_freed == 0 {
3342 break;
3343 }
3344
3345 nr_to_free = nr_to_free.saturating_sub(nr_freed);
3346 freed = true;
3347
3348 if nr_to_free <= nr_to_break_at {
3349 break;
3350 }
3351 }
3352
3353 if freed {
3354 Self::update_bpf_layer_cpumask(
3355 layer,
3356 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3357 );
3358 updated = true;
3359 }
3360 }
3361
3362 for &(idx, target) in &ascending {
3368 let layer = &mut self.layers[idx];
3369
3370 if layer_is_open(layer) {
3371 continue;
3372 }
3373
3374 if layer.growth_algo == LayerGrowthAlgo::StickyDynamic {
3376 continue;
3377 }
3378
3379 let nr_cur = layer.cpus.weight();
3380 if nr_cur >= target {
3381 continue;
3382 }
3383
3384 let mut nr_to_alloc = target - nr_cur;
3385 let mut alloced = false;
3386
3387 while nr_to_alloc > 0 {
3388 let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3389 if nr_alloced == 0 {
3390 break;
3391 }
3392 alloced = true;
3393 nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3394 }
3395
3396 if alloced {
3397 Self::update_bpf_layer_cpumask(
3398 layer,
3399 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3400 );
3401 updated = true;
3402 }
3403 }
3404
3405 if updated {
3407 for (idx, layer) in self.layers.iter_mut().enumerate() {
3408 if !layer_is_open(layer) {
3409 continue;
3410 }
3411
3412 let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3413 let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3414 let nr_available_cpus = available_cpus.weight();
3415
3416 layer.cpus = available_cpus;
3419 layer.nr_cpus = nr_available_cpus;
3420 Self::update_bpf_layer_cpumask(layer, bpf_layer);
3421 }
3422
3423 self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3424 self.cpu_pool.fallback_cpu as u32;
3425
3426 for (lidx, layer) in self.layers.iter().enumerate() {
3427 self.nr_layer_cpus_ranges[lidx] = (
3428 self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3429 self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3430 );
3431 }
3432
3433 let input = ProgramInput {
3435 ..Default::default()
3436 };
3437 let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3438 let _ = prog.test_run(input);
3439
3440 let empty_layer_ids: Vec<u32> = self
3442 .layers
3443 .iter()
3444 .enumerate()
3445 .filter(|(_idx, layer)| layer.nr_cpus == 0)
3446 .map(|(idx, _layer)| idx as u32)
3447 .collect();
3448 for i in 0..self.layers.len() {
3449 self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3450 empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3451 }
3452 self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3453 empty_layer_ids.len() as u32;
3454 }
3455
3456 let _ = self.update_netdev_cpumasks();
3457 Ok(())
3458 }
3459
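    // Propagate each layer's idle_resume_us setting to the CPUs it currently
    // owns; CPUs not owned by any layer are reset to 0.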
3460 fn refresh_idle_qos(&mut self) -> Result<()> {
3461 if !self.idle_qos_enabled {
3462 return Ok(());
3463 }
3464
3465 let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3466 for layer in self.layers.iter() {
3467 let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3468 for cpu in layer.cpus.iter() {
3469 cpu_idle_qos[cpu] = idle_resume_us;
3470 }
3471 }
3472
3473 for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3474 update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3475 }
3476
3477 Ok(())
3478 }
3479
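    // One scheduling interval: refresh stats, mirror the system CPU utilization
    // and per-layer DSQ-insert EWMAs into BPF, then refresh cpumasks, idle QoS,
    // and GPU task affinity.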
3480 fn step(&mut self) -> Result<()> {
3481 let started_at = Instant::now();
3482 self.sched_stats.refresh(
3483 &mut self.skel,
3484 &self.proc_reader,
3485 started_at,
3486 self.processing_dur,
3487 &self.gpu_task_handler,
3488 )?;
3489
3490 self.skel
3492 .maps
3493 .bss_data
3494 .as_mut()
3495 .unwrap()
3496 .system_cpu_util_ewma = (self.sched_stats.system_cpu_util_ewma * 10000.0) as u64;
3497
3498 for layer_id in 0..self.sched_stats.nr_layers {
3499 self.skel
3500 .maps
3501 .bss_data
3502 .as_mut()
3503 .unwrap()
3504 .layer_dsq_insert_ewma[layer_id] =
3505 (self.sched_stats.layer_dsq_insert_ewma[layer_id] * 10000.0) as u64;
3506 }
3507
3508 self.refresh_cpumasks()?;
3509 self.refresh_idle_qos()?;
3510 self.gpu_task_handler.maybe_affinitize();
3511 self.processing_dur += Instant::now().duration_since(started_at);
3512 Ok(())
3513 }
3514
3515 fn generate_sys_stats(
3516 &mut self,
3517 stats: &Stats,
3518 cpus_ranges: &mut [(usize, usize)],
3519 ) -> Result<SysStats> {
3520 let bstats = &stats.bpf_stats;
3521 let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3522
3523 for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3524 let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3525 sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3526 cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3527 }
3528
3529 Ok(sys_stats)
3530 }
3531
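    // Resolve a new cgroup directory to its inode-based cgroup ID, record which
    // configured regex rules match its path as a bitmap (bit N = rule N), and
    // forward a creation event to the scheduler loop.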
3532 fn process_cgroup_creation(
3534 path: &Path,
3535 cgroup_regexes: &HashMap<u32, Regex>,
3536 cgroup_path_to_id: &mut HashMap<String, u64>,
3537 sender: &crossbeam::channel::Sender<CgroupEvent>,
3538 ) {
3539 let path_str = path.to_string_lossy().to_string();
3540
3541 let cgroup_id = std::fs::metadata(path)
3543 .map(|metadata| {
3544 use std::os::unix::fs::MetadataExt;
3545 metadata.ino()
3546 })
3547 .unwrap_or(0);
3548
3549 let mut match_bitmap = 0u64;
3551 for (rule_id, regex) in cgroup_regexes {
3552 if regex.is_match(&path_str) {
3553 match_bitmap |= 1u64 << rule_id;
3554 }
3555 }
3556
3557 cgroup_path_to_id.insert(path_str.clone(), cgroup_id);
3559
3560 if let Err(e) = sender.send(CgroupEvent::Created {
3562 path: path_str,
3563 cgroup_id,
3564 match_bitmap,
3565 }) {
3566 error!("Failed to send cgroup creation event: {}", e);
3567 }
3568 }
3569
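    // Spawn a background thread that watches /sys/fs/cgroup with inotify. The
    // thread first reports every existing cgroup, then reports creations and
    // removals as they happen; select() with a 100ms timeout lets it poll the
    // shutdown flag between inotify reads.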
3570 fn start_cgroup_watcher(
3571 shutdown: Arc<AtomicBool>,
3572 cgroup_regexes: HashMap<u32, Regex>,
3573 ) -> Result<Receiver<CgroupEvent>> {
3574 let mut inotify = Inotify::init().context("Failed to initialize inotify")?;
3575 let mut wd_to_path = HashMap::new();
3576
3577 let (sender, receiver) = crossbeam::channel::bounded::<CgroupEvent>(1024);
3579
3580 let root_wd = inotify
3582 .watches()
3583 .add("/sys/fs/cgroup", WatchMask::CREATE | WatchMask::DELETE)
3584 .context("Failed to add watch for /sys/fs/cgroup")?;
3585 wd_to_path.insert(root_wd, PathBuf::from("/sys/fs/cgroup"));
3586
3587 Self::add_recursive_watches(&mut inotify, &mut wd_to_path, Path::new("/sys/fs/cgroup"))?;
3589
3590 std::thread::spawn(move || {
3592 let mut buffer = [0; 4096];
3593 let inotify_fd = inotify.as_raw_fd();
3594 let mut cgroup_path_to_id = HashMap::<String, u64>::new();
3596
3597 for entry in WalkDir::new("/sys/fs/cgroup")
3599 .into_iter()
3600 .filter_map(|e| e.ok())
3601 .filter(|e| e.file_type().is_dir())
3602 {
3603 let path = entry.path();
3604 Self::process_cgroup_creation(
3605 path,
3606 &cgroup_regexes,
3607 &mut cgroup_path_to_id,
3608 &sender,
3609 );
3610 }
3611
3612 while !shutdown.load(Ordering::Relaxed) {
3613 let ready = unsafe {
3615 let mut read_fds: libc::fd_set = std::mem::zeroed();
3616 libc::FD_ZERO(&mut read_fds);
3617 libc::FD_SET(inotify_fd, &mut read_fds);
3618
                    let mut timeout = libc::timeval {
                        tv_sec: 0,
                        tv_usec: 100_000,
                    };
3623
3624 libc::select(
3625 inotify_fd + 1,
3626 &mut read_fds,
3627 std::ptr::null_mut(),
3628 std::ptr::null_mut(),
3629 &mut timeout,
3630 )
3631 };
3632
3633 if ready <= 0 {
3634 continue;
3636 }
3637
3638 let events = match inotify.read_events(&mut buffer) {
3640 Ok(events) => events,
3641 Err(e) => {
3642 error!("Error reading inotify events: {}", e);
3643 break;
3644 }
3645 };
3646
3647 for event in events {
3648 if !event.mask.contains(inotify::EventMask::CREATE)
3649 && !event.mask.contains(inotify::EventMask::DELETE)
3650 {
3651 continue;
3652 }
3653
3654 let name = match event.name {
3655 Some(name) => name,
3656 None => continue,
3657 };
3658
3659 let parent_path = match wd_to_path.get(&event.wd) {
3660 Some(parent) => parent,
3661 None => {
3662 warn!("Unknown watch descriptor: {:?}", event.wd);
3663 continue;
3664 }
3665 };
3666
3667 let path = parent_path.join(name.to_string_lossy().as_ref());
3668
3669 if event.mask.contains(inotify::EventMask::CREATE) {
3670 if !path.is_dir() {
3671 continue;
3672 }
3673
3674 Self::process_cgroup_creation(
3675 &path,
3676 &cgroup_regexes,
3677 &mut cgroup_path_to_id,
3678 &sender,
3679 );
3680
3681 match inotify
3683 .watches()
3684 .add(&path, WatchMask::CREATE | WatchMask::DELETE)
3685 {
3686 Ok(wd) => {
3687 wd_to_path.insert(wd, path.clone());
3688 }
3689 Err(e) => {
3690 warn!(
3691 "Failed to add watch for new cgroup {}: {}",
3692 path.display(),
3693 e
3694 );
3695 }
3696 }
3697 } else if event.mask.contains(inotify::EventMask::DELETE) {
3698 let path_str = path.to_string_lossy().to_string();
3699
3700 let cgroup_id = cgroup_path_to_id.remove(&path_str).unwrap_or(0);
3702
3703 if let Err(e) = sender.send(CgroupEvent::Removed {
3705 path: path_str,
3706 cgroup_id,
3707 }) {
3708 error!("Failed to send cgroup removal event: {}", e);
3709 }
3710
3711 let wd_to_remove = wd_to_path.iter().find_map(|(wd, watched_path)| {
3713 if watched_path == &path {
3714 Some(wd.clone())
3715 } else {
3716 None
3717 }
3718 });
3719 if let Some(wd) = wd_to_remove {
3720 wd_to_path.remove(&wd);
3721 }
3722 }
3723 }
3724 }
3725 });
3726
3727 Ok(receiver)
3728 }
3729
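    // Add CREATE/DELETE watches for every existing subdirectory of `path` (the
    // root itself is assumed to be watched already by the caller).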
3730 fn add_recursive_watches(
3731 inotify: &mut Inotify,
3732 wd_to_path: &mut HashMap<inotify::WatchDescriptor, PathBuf>,
3733 path: &Path,
3734 ) -> Result<()> {
3735 for entry in WalkDir::new(path)
3736 .into_iter()
3737 .filter_map(|e| e.ok())
3738 .filter(|e| e.file_type().is_dir())
3739 .skip(1)
3740 {
3741 let entry_path = entry.path();
3742 match inotify
3744 .watches()
3745 .add(entry_path, WatchMask::CREATE | WatchMask::DELETE)
3746 {
3747 Ok(wd) => {
3748 wd_to_path.insert(wd, entry_path.to_path_buf());
3749 }
3750 Err(e) => {
3751 debug!("Failed to add watch for {}: {}", entry_path.display(), e);
3752 }
3753 }
3754 }
3755 Ok(())
3756 }
3757
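    // Main loop: step the scheduler every sched_intv, bump the layer refresh
    // sequence every layer_refresh_intv, and in between service stats requests
    // and cgroup events until shutdown is requested or the BPF side exits.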
3758 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3759 let (res_ch, req_ch) = self.stats_server.channels();
3760 let mut next_sched_at = Instant::now() + self.sched_intv;
3761 let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3762 let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3763 let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3764
3765 let cgroup_regexes = self.cgroup_regexes.take().unwrap();
3767 let cgroup_event_rx = if !cgroup_regexes.is_empty() {
3768 Some(Self::start_cgroup_watcher(
3769 shutdown.clone(),
3770 cgroup_regexes,
3771 )?)
3772 } else {
3773 None
3774 };
3775
3776 while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3777 let now = Instant::now();
3778
3779 if now >= next_sched_at {
3780 self.step()?;
3781 while next_sched_at < now {
3782 next_sched_at += self.sched_intv;
3783 }
3784 }
3785
3786 if enable_layer_refresh && now >= next_layer_refresh_at {
3787 self.skel
3788 .maps
3789 .bss_data
3790 .as_mut()
3791 .unwrap()
3792 .layer_refresh_seq_avgruntime += 1;
3793 while next_layer_refresh_at < now {
3794 next_layer_refresh_at += self.layer_refresh_intv;
3795 }
3796 }
3797
3798 let timeout_duration = next_sched_at.saturating_duration_since(Instant::now());
3800 let never_rx = crossbeam::channel::never();
3801 let cgroup_rx = cgroup_event_rx.as_ref().unwrap_or(&never_rx);
3802
3803 select! {
3804 recv(req_ch) -> msg => match msg {
3805 Ok(StatsReq::Hello(tid)) => {
3806 cpus_ranges.insert(
3807 tid,
3808 self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3809 );
3810 let stats =
3811 Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3812 res_ch.send(StatsRes::Hello(stats))?;
3813 }
3814 Ok(StatsReq::Refresh(tid, mut stats)) => {
3815 for i in 0..self.nr_layer_cpus_ranges.len() {
3817 for (_, ranges) in cpus_ranges.iter_mut() {
3818 ranges[i] = (
3819 ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3820 ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3821 );
3822 }
3823 self.nr_layer_cpus_ranges[i] =
3824 (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3825 }
3826
3827 stats.refresh(
3828 &mut self.skel,
3829 &self.proc_reader,
3830 now,
3831 self.processing_dur,
3832 &self.gpu_task_handler,
3833 )?;
3834 let sys_stats =
3835 self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3836 res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3837 }
3838 Ok(StatsReq::Bye(tid)) => {
3839 cpus_ranges.remove(&tid);
3840 res_ch.send(StatsRes::Bye)?;
3841 }
3842 Err(e) => Err(e)?,
3843 },
3844
3845 recv(cgroup_rx) -> event => match event {
3846 Ok(CgroupEvent::Created { path, cgroup_id, match_bitmap }) => {
3847 self.skel.maps.cgroup_match_bitmap.update(
3849 &cgroup_id.to_ne_bytes(),
3850 &match_bitmap.to_ne_bytes(),
3851 libbpf_rs::MapFlags::ANY,
3852 ).with_context(|| format!(
3853 "Failed to insert cgroup {}({}) into BPF map. Cgroup map may be full \
3854 (max 16384 entries). Aborting.",
3855 cgroup_id, path
3856 ))?;
3857
3858 debug!("Added cgroup {} to BPF map with bitmap 0x{:x}", cgroup_id, match_bitmap);
3859 }
3860 Ok(CgroupEvent::Removed { path, cgroup_id }) => {
3861 if let Err(e) = self.skel.maps.cgroup_match_bitmap.delete(&cgroup_id.to_ne_bytes()) {
3863 warn!("Failed to delete cgroup {} from BPF map: {}", cgroup_id, e);
3864 } else {
3865 debug!("Removed cgroup {}({}) from BPF map", cgroup_id, path);
3866 }
3867 }
3868 Err(e) => {
3869 error!("Error receiving cgroup event: {}", e);
3870 }
3871 },
3872
3873 recv(crossbeam::channel::after(timeout_duration)) -> _ => {
3874 }
3876 }
3877 }
3878
3879 let _ = self.struct_ops.take();
3880 uei_report!(&self.skel, uei)
3881 }
3882}
3883
3884impl Drop for Scheduler<'_> {
3885 fn drop(&mut self) {
3886 info!("Unregister {SCHEDULER_NAME} scheduler");
3887
3888 if let Some(struct_ops) = self.struct_ops.take() {
3889 drop(struct_ops);
3890 }
3891 }
3892}
3893
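// Write EXAMPLE_CONFIG as pretty-printed JSON to `path`, refusing to overwrite
// an existing file.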
3894fn write_example_file(path: &str) -> Result<()> {
3895 let mut f = fs::OpenOptions::new()
3896 .create_new(true)
3897 .write(true)
3898 .open(path)?;
3899 Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3900}
3901
3902struct HintLayerInfo {
3903 layer_id: usize,
3904 system_cpu_util_below: Option<f64>,
3905 dsq_insert_below: Option<f64>,
3906}
3907
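// Validate the layer specs (match-block limits, threshold ranges, hint
// uniqueness, cpus/util ranges) and build the hint-value -> layer mapping that
// init() later loads into the BPF hint_to_layer_id_map.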
3908fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, HintLayerInfo>> {
3909 let mut hint_to_layer_map = HashMap::<u64, (usize, String, Option<f64>, Option<f64>)>::new();
3910
3911 let nr_specs = specs.len();
3912 if nr_specs == 0 {
3913 bail!("No layer spec");
3914 }
3915 if nr_specs > MAX_LAYERS {
3916 bail!("Too many layer specs");
3917 }
3918
3919 for (idx, spec) in specs.iter().enumerate() {
3920 if idx < nr_specs - 1 {
3921 if spec.matches.is_empty() {
3922 bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3923 }
3924 } else {
3925 if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3926 bail!("Terminal spec {:?} must have an empty match", spec.name);
3927 }
3928 }
3929
3930 if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3931 bail!(
3932 "Spec {:?} has too many ({}) OR match blocks",
3933 spec.name,
3934 spec.matches.len()
3935 );
3936 }
3937
3938 for (ands_idx, ands) in spec.matches.iter().enumerate() {
3939 if ands.len() > NR_LAYER_MATCH_KINDS {
3940 bail!(
3941 "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3942 spec.name,
3943 ands_idx,
3944 ands.len()
3945 );
3946 }
3947 let mut hint_equals_cnt = 0;
3948 let mut system_cpu_util_below_cnt = 0;
3949 let mut dsq_insert_below_cnt = 0;
3950 let mut hint_value: Option<u64> = None;
3951 let mut system_cpu_util_threshold: Option<f64> = None;
3952 let mut dsq_insert_threshold: Option<f64> = None;
3953 for one in ands.iter() {
3954 match one {
3955 LayerMatch::CgroupPrefix(prefix) => {
3956 if prefix.len() > MAX_PATH {
3957 bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3958 }
3959 }
3960 LayerMatch::CgroupSuffix(suffix) => {
3961 if suffix.len() > MAX_PATH {
3962 bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3963 }
3964 }
3965 LayerMatch::CgroupContains(substr) => {
3966 if substr.len() > MAX_PATH {
3967 bail!("Spec {:?} has too long a cgroup substr", spec.name);
3968 }
3969 }
3970 LayerMatch::CommPrefix(prefix) => {
3971 if prefix.len() > MAX_COMM {
3972 bail!("Spec {:?} has too long a comm prefix", spec.name);
3973 }
3974 }
3975 LayerMatch::PcommPrefix(prefix) => {
3976 if prefix.len() > MAX_COMM {
3977 bail!("Spec {:?} has too long a process name prefix", spec.name);
3978 }
3979 }
3980 LayerMatch::SystemCpuUtilBelow(threshold) => {
3981 if *threshold < 0.0 || *threshold > 1.0 {
3982 bail!(
3983 "Spec {:?} has SystemCpuUtilBelow threshold outside the range [0.0, 1.0]",
3984 spec.name
3985 );
3986 }
3987 system_cpu_util_threshold = Some(*threshold);
3988 system_cpu_util_below_cnt += 1;
3989 }
3990 LayerMatch::DsqInsertBelow(threshold) => {
3991 if *threshold < 0.0 || *threshold > 1.0 {
3992 bail!(
3993 "Spec {:?} has DsqInsertBelow threshold outside the range [0.0, 1.0]",
3994 spec.name
3995 );
3996 }
3997 dsq_insert_threshold = Some(*threshold);
3998 dsq_insert_below_cnt += 1;
3999 }
4000 LayerMatch::HintEquals(hint) => {
4001 if *hint > 1024 {
4002 bail!(
4003 "Spec {:?} has hint value outside the range [0, 1024]",
4004 spec.name
4005 );
4006 }
4007 hint_value = Some(*hint);
4008 hint_equals_cnt += 1;
4009 }
4010 _ => {}
4011 }
4012 }
4013 if hint_equals_cnt > 1 {
4014 bail!("Only 1 HintEquals match permitted per AND block");
4015 }
4016 let high_freq_matcher_cnt = system_cpu_util_below_cnt + dsq_insert_below_cnt;
4017 if high_freq_matcher_cnt > 0 {
4018 if hint_equals_cnt != 1 {
4019 bail!("High-frequency matchers (SystemCpuUtilBelow, DsqInsertBelow) must be used with one HintEquals");
4020 }
4021 if system_cpu_util_below_cnt > 1 {
4022 bail!("Only 1 SystemCpuUtilBelow match permitted per AND block");
4023 }
4024 if dsq_insert_below_cnt > 1 {
4025 bail!("Only 1 DsqInsertBelow match permitted per AND block");
4026 }
4027 if ands.len() != hint_equals_cnt + system_cpu_util_below_cnt + dsq_insert_below_cnt
4028 {
4029 bail!("High-frequency matchers must be used only with HintEquals (no other matchers)");
4030 }
4031 } else if hint_equals_cnt == 1 && ands.len() != 1 {
4032 bail!("HintEquals match cannot be in conjunction with other matches");
4033 }
4034
4035 if let Some(hint) = hint_value {
4037 if let Some((layer_id, name, _, _)) = hint_to_layer_map.get(&hint) {
4038 if *layer_id != idx {
4039 bail!(
4040 "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
4041 spec.name,
4042 hint,
4043 name
4044 );
4045 }
4046 } else {
4047 hint_to_layer_map.insert(
4048 hint,
4049 (
4050 idx,
4051 spec.name.clone(),
4052 system_cpu_util_threshold,
4053 dsq_insert_threshold,
4054 ),
4055 );
4056 }
4057 }
4058 }
4059
4060 match spec.kind {
4061 LayerKind::Confined {
4062 cpus_range,
4063 util_range,
4064 ..
4065 }
4066 | LayerKind::Grouped {
4067 cpus_range,
4068 util_range,
4069 ..
4070 } => {
4071 if let Some((cpus_min, cpus_max)) = cpus_range {
4072 if cpus_min > cpus_max {
4073 bail!(
4074 "Spec {:?} has invalid cpus_range({}, {})",
4075 spec.name,
4076 cpus_min,
4077 cpus_max
4078 );
4079 }
4080 }
4081 if util_range.0 >= util_range.1 {
4082 bail!(
4083 "Spec {:?} has invalid util_range ({}, {})",
4084 spec.name,
4085 util_range.0,
4086 util_range.1
4087 );
4088 }
4089 }
4090 _ => {}
4091 }
4092 }
4093
4094 Ok(hint_to_layer_map
4095 .into_iter()
4096 .map(|(k, v)| {
4097 (
4098 k,
4099 HintLayerInfo {
4100 layer_id: v.0,
4101 system_cpu_util_below: v.2,
4102 dsq_insert_below: v.3,
4103 },
4104 )
4105 })
4106 .collect())
4107}
4108
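// Return the last `len` characters of `cgroup`, or the whole string if shorter.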
4109fn name_suffix(cgroup: &str, len: usize) -> String {
4110 let suffixlen = std::cmp::min(len, cgroup.len());
4111 let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
4112
4113 suffixrev.chars().rev().collect()
4114}
4115
4116fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
4117 let mut paths = vec![];
4118
4119 if !dir.is_dir() {
        bail!("path {:?} does not correspond to a directory", dir);
4121 }
4122
4123 let direntries = fs::read_dir(dir)?;
4124
4125 for entry in direntries {
4126 let path = entry?.path();
4127 if path.is_dir() {
4128 paths.append(&mut traverse_sysfs(&path)?);
4129 paths.push(path);
4130 }
4131 }
4132
4133 Ok(paths)
4134}
4135
4136fn find_cpumask(cgroup: &str) -> Cpumask {
    let path = format!("{}/cpuset.cpus.effective", cgroup);

    let description = fs::read_to_string(&path).unwrap();
4141
4142 Cpumask::from_cpulist(&description).unwrap()
4143}
4144
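// Expand a template match into concrete per-cgroup matches: every cgroup under
// /sys/fs/cgroup that matches the template becomes a (CgroupSuffix, cpumask)
// pair, where the cpumask is the cgroup's cpuset.cpus.effective.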
4145fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
4146 match rule {
4147 LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
4148 .into_iter()
4149 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
4150 .filter(|cgroup| cgroup.ends_with(suffix))
4151 .map(|cgroup| {
4152 (
4153 {
4154 let mut slashterminated = cgroup.clone();
4155 slashterminated.push('/');
4156 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
4157 },
4158 find_cpumask(&cgroup),
4159 )
4160 })
4161 .collect()),
        LayerMatch::CgroupRegex(expr) => {
            // Compile the regex once rather than once per visited cgroup.
            let re =
                Regex::new(expr).with_context(|| format!("invalid cgroup regex {:?}", expr))?;
            Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
                .into_iter()
                .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
                .filter(|cgroup| re.is_match(cgroup))
                .map(|cgroup| {
                    (
                        {
                            let mut slashterminated = cgroup.clone();
                            slashterminated.push('/');
                            LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
                        },
                        find_cpumask(&cgroup),
                    )
                })
                .collect())
        }
        _ => bail!("Unimplemented template match type {:?}", rule),
4184 }
4185}
4186
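// Open the configured raw PMU event on every possible CPU and store the
// resulting perf fds in scx_pmu_map so the BPF side can read the counters.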
4187fn create_perf_fds(skel: &mut BpfSkel, event: u64) -> Result<()> {
4188 let mut attr = perf::bindings::perf_event_attr::default();
4189 attr.size = std::mem::size_of::<perf::bindings::perf_event_attr>() as u32;
4190 attr.type_ = perf::bindings::PERF_TYPE_RAW;
4191 attr.config = event;
4192 attr.sample_type = 0u64;
4193 attr.__bindgen_anon_1.sample_period = 0u64;
4194 attr.set_disabled(0);
4195
4196 let perf_events_map = &skel.maps.scx_pmu_map;
4197 let map_fd = unsafe { libbpf_sys::bpf_map__fd(perf_events_map.as_libbpf_object().as_ptr()) };
4198
4199 let mut failures = 0u64;
4200
4201 for cpu in 0..*NR_CPUS_POSSIBLE {
4202 let fd = unsafe { perf::perf_event_open(&mut attr as *mut _, -1, cpu as i32, -1, 0) };
4203 if fd < 0 {
4204 failures += 1;
4205 trace!(
4206 "perf_event_open failed cpu={cpu} errno={}",
4207 std::io::Error::last_os_error()
4208 );
4209 continue;
4210 }
4211
4212 let key = cpu as u32;
4213 let val = fd as u32;
4214 let ret = unsafe {
4215 libbpf_sys::bpf_map_update_elem(
4216 map_fd,
4217 &key as *const _ as *const _,
4218 &val as *const _ as *const _,
4219 0,
4220 )
4221 };
4222 if ret != 0 {
4223 trace!("bpf_map_update_elem failed cpu={cpu} fd={fd} ret={ret}");
4224 } else {
4225 trace!("mapped cpu={cpu} -> fd={fd}");
4226 }
4227 }
4228
4229 if failures > 0 {
        warn!("membw tracking: failed to install {failures} counters");
4231 }
4233
4234 Ok(())
4235}
4236
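// Pick a PMU event that approximates memory bandwidth based on the CPU codename
// and encode it as (umask << 8) | event for perf_event_attr.config.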
4237fn setup_membw_tracking(skel: &mut OpenBpfSkel) -> Result<u64> {
4239 let pmumanager = PMUManager::new()?;
4240 let codename = &pmumanager.codename as &str;
4241
4242 let pmuspec = match codename {
4243 "amdzen1" | "amdzen2" | "amdzen3" => {
4244 trace!("found AMD codename {codename}");
4245 pmumanager.pmus.get("ls_any_fills_from_sys.mem_io_local")
4246 }
4247 "amdzen4" | "amdzen5" => {
4248 trace!("found AMD codename {codename}");
4249 pmumanager.pmus.get("ls_any_fills_from_sys.dram_io_all")
4250 }
4251
4252 "haswell" | "broadwell" | "broadwellde" | "broadwellx" | "skylake" | "skylakex"
4253 | "cascadelakex" | "arrowlake" | "meteorlake" | "sapphirerapids" | "emeraldrapids"
4254 | "graniterapids" => {
4255 trace!("found Intel codename {codename}");
4256 pmumanager.pmus.get("LONGEST_LAT_CACHE.MISS")
4257 }
4258
4259 _ => {
4260 trace!("found unknown codename {codename}");
4261 None
4262 }
4263 };
4264
    let spec = pmuspec
        .ok_or_else(|| anyhow!("membw tracking: no supported PMU event for codename {codename}"))?;
4266 let config = (spec.umask << 8) | spec.event[0];
4267
4268 skel.maps.rodata_data.as_mut().unwrap().membw_event = config;
4270
4271 Ok(config)
4272}
4273
4274#[clap_main::clap_main]
4275fn main(opts: Opts) -> Result<()> {
4276 if opts.version {
4277 println!(
4278 "scx_layered {}",
4279 build_id::full_version(env!("CARGO_PKG_VERSION"))
4280 );
4281 return Ok(());
4282 }
4283
4284 if opts.help_stats {
4285 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
4286 return Ok(());
4287 }
4288
4289 let env_filter = EnvFilter::try_from_default_env()
4290 .or_else(|_| match EnvFilter::try_new(&opts.log_level) {
4291 Ok(filter) => Ok(filter),
4292 Err(e) => {
                eprintln!(
                    "invalid log filter {:?} ({}), falling back to \"info\"",
                    opts.log_level, e
                );
4297 EnvFilter::try_new("info")
4298 }
4299 })
4300 .unwrap_or_else(|_| EnvFilter::new("info"));
4301
4302 match tracing_subscriber::fmt()
4303 .with_env_filter(env_filter)
4304 .with_target(true)
4305 .with_thread_ids(true)
4306 .with_file(true)
4307 .with_line_number(true)
4308 .try_init()
4309 {
4310 Ok(()) => {}
4311 Err(e) => eprintln!("failed to init logger: {}", e),
4312 }
4313
4314 if opts.verbose > 0 {
4315 warn!("Setting verbose via -v is depricated and will be an error in future releases.");
4316 }
4317
4318 if opts.no_load_frac_limit {
4319 warn!("--no-load-frac-limit is deprecated and noop");
4320 }
4321 if opts.layer_preempt_weight_disable != 0.0 {
4322 warn!("--layer-preempt-weight-disable is deprecated and noop");
4323 }
4324 if opts.layer_growth_weight_disable != 0.0 {
4325 warn!("--layer-growth-weight-disable is deprecated and noop");
4326 }
4327 if opts.local_llc_iteration {
4328 warn!("--local_llc_iteration is deprecated and noop");
4329 }
4330
4331 debug!("opts={:?}", &opts);
4332
4333 if let Some(run_id) = opts.run_id {
4334 info!("scx_layered run_id: {}", run_id);
4335 }
4336
4337 let shutdown = Arc::new(AtomicBool::new(false));
4338 let shutdown_clone = shutdown.clone();
4339 ctrlc::set_handler(move || {
4340 shutdown_clone.store(true, Ordering::Relaxed);
4341 })
4342 .context("Error setting Ctrl-C handler")?;
4343
4344 if let Some(intv) = opts.monitor.or(opts.stats) {
4345 let shutdown_copy = shutdown.clone();
4346 let jh = std::thread::spawn(move || {
4347 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
4348 Ok(_) => {
4349 debug!("stats monitor thread finished successfully")
4350 }
4351 Err(error_object) => {
4352 warn!(
4353 "stats monitor thread finished because of an error {}",
4354 error_object
4355 )
4356 }
4357 }
4358 });
4359 if opts.monitor.is_some() {
4360 let _ = jh.join();
4361 return Ok(());
4362 }
4363 }
4364
4365 if let Some(path) = &opts.example {
4366 write_example_file(path)?;
4367 return Ok(());
4368 }
4369
4370 let mut layer_config = match opts.run_example {
4371 true => EXAMPLE_CONFIG.clone(),
4372 false => LayerConfig { specs: vec![] },
4373 };
4374
4375 for (idx, input) in opts.specs.iter().enumerate() {
4376 let specs = LayerSpec::parse(input)
4377 .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
4378
4379 for spec in specs {
4380 match spec.template {
4381 Some(ref rule) => {
4382 let matches = expand_template(&rule)?;
4383 if matches.is_empty() {
4386 layer_config.specs.push(spec);
4387 } else {
4388 for (mt, mask) in matches {
4389 let mut genspec = spec.clone();
4390
4391 genspec.cpuset = Some(mask);
4392
4393 for orterm in &mut genspec.matches {
4395 orterm.push(mt.clone());
4396 }
4397
4398 match &mt {
4399 LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
4400 _ => bail!("Template match has unexpected type"),
4401 }
4402
4403 layer_config.specs.push(genspec);
4405 }
4406 }
4407 }
4408
4409 None => {
4410 layer_config.specs.push(spec);
4411 }
4412 }
4413 }
4414 }
4415
4416 for spec in layer_config.specs.iter_mut() {
4417 let common = spec.kind.common_mut();
4418
4419 if common.slice_us == 0 {
4420 common.slice_us = opts.slice_us;
4421 }
4422
4423 if common.weight == 0 {
4424 common.weight = DEFAULT_LAYER_WEIGHT;
4425 }
4426 common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
4427
4428 if common.preempt {
4429 if common.disallow_open_after_us.is_some() {
4430 warn!(
4431 "Preempt layer {} has non-null disallow_open_after_us, ignored",
4432 &spec.name
4433 );
4434 }
4435 if common.disallow_preempt_after_us.is_some() {
4436 warn!(
4437 "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
4438 &spec.name
4439 );
4440 }
4441 common.disallow_open_after_us = Some(u64::MAX);
4442 common.disallow_preempt_after_us = Some(u64::MAX);
4443 } else {
4444 if common.disallow_open_after_us.is_none() {
4445 common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
4446 }
4447
4448 if common.disallow_preempt_after_us.is_none() {
4449 common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
4450 }
4451 }
4452
4453 if common.idle_smt.is_some() {
4454 warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
4455 }
4456 }
4457
4458 let membw_required = layer_config.specs.iter().any(|spec| match spec.kind {
4459 LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
4460 membw_gb.is_some()
4461 }
4462 LayerKind::Open { .. } => false,
4463 });
4464
4465 if opts.print_and_exit {
4466 println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
4467 return Ok(());
4468 }
4469
4470 debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
4471 let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
4472
4473 let mut open_object = MaybeUninit::uninit();
4474 loop {
4475 let mut sched = Scheduler::init(
4476 &opts,
4477 &layer_config.specs,
4478 &mut open_object,
4479 &hint_to_layer_map,
4480 membw_required,
4481 )?;
4482 if !sched.run(shutdown.clone())?.should_restart() {
4483 break;
4484 }
4485 }
4486
4487 Ok(())
4488}