1mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::collections::HashSet;
11use std::ffi::CString;
12use std::fs;
13use std::io::Write;
14use std::mem::MaybeUninit;
15use std::ops::Sub;
16use std::path::Path;
17use std::path::PathBuf;
18use std::sync::atomic::AtomicBool;
19use std::sync::atomic::Ordering;
20use std::sync::Arc;
21use std::thread::ThreadId;
22use std::time::Duration;
23use std::time::Instant;
24
25use anyhow::anyhow;
26use anyhow::bail;
27use anyhow::Context;
28use anyhow::Result;
29pub use bpf_skel::*;
30use clap::Parser;
31use crossbeam::channel::RecvTimeoutError;
32use lazy_static::lazy_static;
33use libbpf_rs::libbpf_sys;
34use libbpf_rs::AsRawLibbpf;
35use libbpf_rs::MapCore as _;
36use libbpf_rs::OpenObject;
37use libbpf_rs::ProgramInput;
38use log::info;
39use log::trace;
40use log::warn;
41use log::{debug, error};
42use nix::sched::CpuSet;
43use nvml_wrapper::error::NvmlError;
44use nvml_wrapper::Nvml;
45use once_cell::sync::OnceCell;
46use regex::Regex;
47use scx_bpf_compat;
48use scx_layered::*;
49use scx_raw_pmu::PMUManager;
50use scx_stats::prelude::*;
51use scx_utils::build_id;
52use scx_utils::compat;
53use scx_utils::init_libbpf_logging;
54use scx_utils::libbpf_clap_opts::LibbpfOpts;
55use scx_utils::perf;
56use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
57use scx_utils::read_netdevs;
58use scx_utils::scx_enums;
59use scx_utils::scx_ops_attach;
60use scx_utils::scx_ops_load;
61use scx_utils::scx_ops_open;
62use scx_utils::uei_exited;
63use scx_utils::uei_report;
64use scx_utils::CoreType;
65use scx_utils::Cpumask;
66use scx_utils::Llc;
67use scx_utils::NetDev;
68use scx_utils::Topology;
69use scx_utils::UserExitInfo;
70use scx_utils::NR_CPUS_POSSIBLE;
71use scx_utils::NR_CPU_IDS;
72use stats::LayerStats;
73use stats::StatsReq;
74use stats::StatsRes;
75use stats::SysStats;
76use std::collections::VecDeque;
77use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
78
79const SCHEDULER_NAME: &str = "scx_layered";
80const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
81const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
82const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
83const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
84const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
85const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
86const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
87const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
88const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
89const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
90
91const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
92const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
93const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
94const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
95const LAYER_USAGE_PROTECTED_PREEMPT: usize =
96 bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
97const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
98
99const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
100const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
101const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
102
103const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
104
105static NVML: OnceCell<Nvml> = OnceCell::new();
106
107fn nvml() -> Result<&'static Nvml, NvmlError> {
108 NVML.get_or_try_init(Nvml::init)
109}
110
111lazy_static! {
112 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
113 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
114 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
115 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
116 specs: vec![
117 LayerSpec {
118 name: "batch".into(),
119 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
120 cpuset: None,
121 template: None,
122 matches: vec![
123 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
124 vec![LayerMatch::NiceAbove(0)],
125 ],
126 kind: LayerKind::Confined {
127 util_range: (0.8, 0.9),
128 cpus_range: Some((0, 16)),
129 cpus_range_frac: None,
130 protected: false,
131 membw_gb: None,
132 common: LayerCommon {
133 min_exec_us: 1000,
134 yield_ignore: 0.0,
135 preempt: false,
136 preempt_first: false,
137 exclusive: false,
138 allow_node_aligned: false,
139 skip_remote_node: false,
140 prev_over_idle_core: false,
141 idle_smt: None,
142 slice_us: 20000,
143 fifo: false,
144 weight: DEFAULT_LAYER_WEIGHT,
145 disallow_open_after_us: None,
146 disallow_preempt_after_us: None,
147 xllc_mig_min_us: 1000.0,
148 growth_algo: LayerGrowthAlgo::Sticky,
149 idle_resume_us: None,
150 perf: 1024,
151 nodes: vec![],
152 llcs: vec![],
153 placement: LayerPlacement::Standard,
154 },
155 },
156 },
157 LayerSpec {
158 name: "immediate".into(),
159 comment: Some("tasks under workload.slice with nice value < 0".into()),
160 cpuset: None,
161 template: None,
162 matches: vec![vec![
163 LayerMatch::CgroupPrefix("workload.slice/".into()),
164 LayerMatch::NiceBelow(0),
165 ]],
166 kind: LayerKind::Open {
167 common: LayerCommon {
168 min_exec_us: 100,
169 yield_ignore: 0.25,
170 preempt: true,
171 preempt_first: false,
172 exclusive: true,
173 allow_node_aligned: true,
174 skip_remote_node: false,
175 prev_over_idle_core: true,
176 idle_smt: None,
177 slice_us: 20000,
178 fifo: false,
179 weight: DEFAULT_LAYER_WEIGHT,
180 disallow_open_after_us: None,
181 disallow_preempt_after_us: None,
182 xllc_mig_min_us: 0.0,
183 growth_algo: LayerGrowthAlgo::Sticky,
184 perf: 1024,
185 idle_resume_us: None,
186 nodes: vec![],
187 llcs: vec![],
188 placement: LayerPlacement::Standard,
189 },
190 },
191 },
192 LayerSpec {
193 name: "stress-ng".into(),
194 comment: Some("stress-ng test layer".into()),
195 cpuset: None,
196 template: None,
197 matches: vec![
198 vec![LayerMatch::CommPrefix("stress-ng".into()),],
199 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
200 ],
201 kind: LayerKind::Confined {
202 cpus_range: None,
203 util_range: (0.2, 0.8),
204 protected: false,
205 cpus_range_frac: None,
206 membw_gb: None,
207 common: LayerCommon {
208 min_exec_us: 800,
209 yield_ignore: 0.0,
210 preempt: true,
211 preempt_first: false,
212 exclusive: false,
213 allow_node_aligned: false,
214 skip_remote_node: false,
215 prev_over_idle_core: false,
216 idle_smt: None,
217 slice_us: 800,
218 fifo: false,
219 weight: DEFAULT_LAYER_WEIGHT,
220 disallow_open_after_us: None,
221 disallow_preempt_after_us: None,
222 xllc_mig_min_us: 0.0,
223 growth_algo: LayerGrowthAlgo::Topo,
224 perf: 1024,
225 idle_resume_us: None,
226 nodes: vec![],
227 llcs: vec![],
228 placement: LayerPlacement::Standard,
229 },
230 },
231 },
232 LayerSpec {
233 name: "normal".into(),
234 comment: Some("the rest".into()),
235 cpuset: None,
236 template: None,
237 matches: vec![vec![]],
238 kind: LayerKind::Grouped {
239 cpus_range: None,
240 util_range: (0.5, 0.6),
241 util_includes_open_cputime: true,
242 protected: false,
243 cpus_range_frac: None,
244 membw_gb: None,
245 common: LayerCommon {
246 min_exec_us: 200,
247 yield_ignore: 0.0,
248 preempt: false,
249 preempt_first: false,
250 exclusive: false,
251 allow_node_aligned: false,
252 skip_remote_node: false,
253 prev_over_idle_core: false,
254 idle_smt: None,
255 slice_us: 20000,
256 fifo: false,
257 weight: DEFAULT_LAYER_WEIGHT,
258 disallow_open_after_us: None,
259 disallow_preempt_after_us: None,
260 xllc_mig_min_us: 100.0,
261 growth_algo: LayerGrowthAlgo::Linear,
262 perf: 1024,
263 idle_resume_us: None,
264 nodes: vec![],
265 llcs: vec![],
266 placement: LayerPlacement::Standard,
267 },
268 },
269 },
270 ],
271 };
272}
273
274#[derive(Debug, Parser)]
554#[command(verbatim_doc_comment)]
555struct Opts {
556 #[clap(short = 's', long, default_value = "20000")]
558 slice_us: u64,
559
560 #[clap(short = 'M', long, default_value = "0")]
565 max_exec_us: u64,
566
567 #[clap(short = 'i', long, default_value = "0.1")]
569 interval: f64,
570
571 #[clap(short = 'n', long, default_value = "false")]
574 no_load_frac_limit: bool,
575
576 #[clap(long, default_value = "0")]
578 exit_dump_len: u32,
579
580 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
583 verbose: u8,
584
585 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
589 disable_topology: Option<bool>,
590
591 #[clap(long)]
593 xnuma_preemption: bool,
594
595 #[clap(long)]
597 monitor_disable: bool,
598
599 #[clap(short = 'e', long)]
601 example: Option<String>,
602
603 #[clap(long, default_value = "0.0")]
607 layer_preempt_weight_disable: f64,
608
609 #[clap(long, default_value = "0.0")]
613 layer_growth_weight_disable: f64,
614
615 #[clap(long)]
617 stats: Option<f64>,
618
619 #[clap(long)]
622 monitor: Option<f64>,
623
624 #[clap(long)]
626 run_example: bool,
627
628 #[clap(long, default_value = "false")]
631 local_llc_iteration: bool,
632
633 #[clap(long, default_value = "10000")]
638 lo_fb_wait_us: u64,
639
640 #[clap(long, default_value = ".05")]
643 lo_fb_share: f64,
644
645 #[clap(long, default_value = "false")]
647 disable_antistall: bool,
648
649 #[clap(long, default_value = "false")]
651 enable_gpu_affinitize: bool,
652
653 #[clap(long, default_value = "900")]
656 gpu_affinitize_secs: u64,
657
658 #[clap(long, default_value = "false")]
663 enable_match_debug: bool,
664
665 #[clap(long, default_value = "3")]
667 antistall_sec: u64,
668
669 #[clap(long, default_value = "false")]
671 enable_gpu_support: bool,
672
673 #[clap(long, default_value = "3")]
681 gpu_kprobe_level: u64,
682
683 #[clap(long, default_value = "false")]
685 netdev_irq_balance: bool,
686
687 #[clap(long, default_value = "false")]
689 disable_queued_wakeup: bool,
690
691 #[clap(long, default_value = "false")]
693 disable_percpu_kthread_preempt: bool,
694
695 #[clap(long, default_value = "false")]
699 percpu_kthread_preempt_all: bool,
700
701 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
703 version: bool,
704
705 #[clap(long)]
707 help_stats: bool,
708
709 specs: Vec<String>,
711
712 #[clap(long, default_value = "2000")]
715 layer_refresh_ms_avgruntime: u64,
716
717 #[clap(long, default_value = "")]
719 task_hint_map: String,
720
721 #[clap(long, default_value = "false")]
723 print_and_exit: bool,
724
725 #[clap(long, default_value = "")]
727 hi_fb_thread_name: String,
728
729 #[clap(flatten, next_help_heading = "Libbpf Options")]
730 pub libbpf: LibbpfOpts,
731}
732
733fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
734 reader
735 .read_stat()
736 .context("Failed to read procfs")?
737 .total_cpu
738 .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
739}
740
741fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
742 match (curr, prev) {
743 (
744 fb_procfs::CpuStat {
745 user_usec: Some(curr_user),
746 nice_usec: Some(curr_nice),
747 system_usec: Some(curr_system),
748 idle_usec: Some(curr_idle),
749 iowait_usec: Some(curr_iowait),
750 irq_usec: Some(curr_irq),
751 softirq_usec: Some(curr_softirq),
752 stolen_usec: Some(curr_stolen),
753 ..
754 },
755 fb_procfs::CpuStat {
756 user_usec: Some(prev_user),
757 nice_usec: Some(prev_nice),
758 system_usec: Some(prev_system),
759 idle_usec: Some(prev_idle),
760 iowait_usec: Some(prev_iowait),
761 irq_usec: Some(prev_irq),
762 softirq_usec: Some(prev_softirq),
763 stolen_usec: Some(prev_stolen),
764 ..
765 },
766 ) => {
767 let idle_usec = curr_idle.saturating_sub(*prev_idle);
768 let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
769 let user_usec = curr_user.saturating_sub(*prev_user);
770 let system_usec = curr_system.saturating_sub(*prev_system);
771 let nice_usec = curr_nice.saturating_sub(*prev_nice);
772 let irq_usec = curr_irq.saturating_sub(*prev_irq);
773 let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
774 let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
775
776 let busy_usec =
777 user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
778 let total_usec = idle_usec + busy_usec + iowait_usec;
779 if total_usec > 0 {
780 Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
781 } else {
782 Ok(1.0)
783 }
784 }
785 _ => bail!("Missing stats in cpustat"),
786 }
787}
788
789fn copy_into_cstr(dst: &mut [i8], src: &str) {
790 let cstr = CString::new(src).unwrap();
791 let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
792 dst[0..bytes.len()].copy_from_slice(bytes);
793}
794
795fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
796 let mut mask = 0;
797 for node in nodes {
798 mask |= 1 << node;
799 }
800 mask
801}
802
803fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
804 let mut mask = 0;
805 for (_, cache) in llcs {
806 mask |= 1 << cache.id;
807 }
808 mask
809}
810
811fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
812 let mut cpu_ctxs = vec![];
813 let cpu_ctxs_vec = skel
814 .maps
815 .cpu_ctxs
816 .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
817 .context("Failed to lookup cpu_ctx")?
818 .unwrap();
819 for cpu in 0..*NR_CPUS_POSSIBLE {
820 cpu_ctxs.push(*unsafe {
821 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
822 });
823 }
824 Ok(cpu_ctxs)
825}
826
827#[derive(Clone, Debug)]
828struct BpfStats {
829 gstats: Vec<u64>,
830 lstats: Vec<Vec<u64>>,
831 lstats_sums: Vec<u64>,
832 llc_lstats: Vec<Vec<Vec<u64>>>, }
834
835impl BpfStats {
836 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
837 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
838 let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
839 let mut gstats = vec![0u64; NR_GSTATS];
840 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
841 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
842
843 for cpu in 0..*NR_CPUS_POSSIBLE {
844 for stat in 0..NR_GSTATS {
845 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
846 }
847 for layer in 0..nr_layers {
848 for stat in 0..NR_LSTATS {
849 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
850 }
851 }
852 }
853
854 let mut lstats_sums = vec![0u64; NR_LSTATS];
855 for layer in 0..nr_layers {
856 for stat in 0..NR_LSTATS {
857 lstats_sums[stat] += lstats[layer][stat];
858 }
859 }
860
861 for llc_id in 0..nr_llcs {
862 let key = llc_id as u32;
868 let llc_id_slice =
869 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
870 let v = skel
871 .maps
872 .llc_data
873 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
874 .unwrap()
875 .unwrap();
876 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
877
878 for layer_id in 0..nr_layers {
879 for stat_id in 0..NR_LLC_LSTATS {
880 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
881 }
882 }
883 }
884
885 Self {
886 gstats,
887 lstats,
888 lstats_sums,
889 llc_lstats,
890 }
891 }
892}
893
894impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
895 type Output = BpfStats;
896
897 fn sub(self, rhs: &'b BpfStats) -> BpfStats {
898 let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
899 BpfStats {
900 gstats: vec_sub(&self.gstats, &rhs.gstats),
901 lstats: self
902 .lstats
903 .iter()
904 .zip(rhs.lstats.iter())
905 .map(|(l, r)| vec_sub(l, r))
906 .collect(),
907 lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
908 llc_lstats: self
909 .llc_lstats
910 .iter()
911 .zip(rhs.llc_lstats.iter())
912 .map(|(l_layer, r_layer)| {
913 l_layer
914 .iter()
915 .zip(r_layer.iter())
916 .map(|(l_llc, r_llc)| {
917 let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
918 r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
920 vec_sub(&l_llc, &r_llc)
921 })
922 .collect()
923 })
924 .collect(),
925 }
926 }
927}
928
929#[derive(Clone, Debug)]
930struct Stats {
931 at: Instant,
932 elapsed: Duration,
933 nr_layers: usize,
934 nr_layer_tasks: Vec<usize>,
935 nr_nodes: usize,
936
937 total_util: f64, layer_utils: Vec<Vec<f64>>,
939 prev_layer_usages: Vec<Vec<u64>>,
940
941 layer_membws: Vec<Vec<f64>>, prev_layer_membw_agg: Vec<Vec<u64>>, cpu_busy: f64, prev_total_cpu: fb_procfs::CpuStat,
946
947 bpf_stats: BpfStats,
948 prev_bpf_stats: BpfStats,
949
950 processing_dur: Duration,
951 prev_processing_dur: Duration,
952
953 layer_slice_us: Vec<u64>,
954
955 gpu_tasks_affinitized: u64,
956 gpu_task_affinitization_ms: u64,
957}
958
959impl Stats {
960 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
961 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
962
963 for cpu in 0..*NR_CPUS_POSSIBLE {
964 for layer in 0..nr_layers {
965 for usage in 0..NR_LAYER_USAGES {
966 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
967 }
968 }
969 }
970
971 layer_usages
972 }
973
974 fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
976 let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
977
978 for cpu in 0..*NR_CPUS_POSSIBLE {
979 for layer in 0..nr_layers {
980 for usage in 0..NR_LAYER_USAGES {
981 let delta = cpu_ctxs[cpu].layer_membw_agg[layer][usage];
982 layer_membw_agg[layer][usage] += delta;
983 }
984 }
985 }
986
987 layer_membw_agg
988 }
989
990 fn new(
991 skel: &mut BpfSkel,
992 proc_reader: &fb_procfs::ProcReader,
993 gpu_task_affinitizer: &GpuTaskAffinitizer,
994 ) -> Result<Self> {
995 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
996 let cpu_ctxs = read_cpu_ctxs(skel)?;
997 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
998 let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
999
1000 Ok(Self {
1001 at: Instant::now(),
1002 elapsed: Default::default(),
1003 nr_layers,
1004 nr_layer_tasks: vec![0; nr_layers],
1005 nr_nodes,
1006
1007 total_util: 0.0,
1008 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1009 layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1010 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1011 prev_layer_membw_agg: Self::read_layer_membw_agg(&cpu_ctxs, nr_layers),
1012
1013 cpu_busy: 0.0,
1014 prev_total_cpu: read_total_cpu(proc_reader)?,
1015
1016 bpf_stats: bpf_stats.clone(),
1017 prev_bpf_stats: bpf_stats,
1018
1019 processing_dur: Default::default(),
1020 prev_processing_dur: Default::default(),
1021
1022 layer_slice_us: vec![0; nr_layers],
1023 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1024 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1025 })
1026 }
1027
1028 fn refresh(
1029 &mut self,
1030 skel: &mut BpfSkel,
1031 proc_reader: &fb_procfs::ProcReader,
1032 now: Instant,
1033 cur_processing_dur: Duration,
1034 gpu_task_affinitizer: &GpuTaskAffinitizer,
1035 ) -> Result<()> {
1036 let elapsed = now.duration_since(self.at);
1037 let elapsed_f64 = elapsed.as_secs_f64();
1038 let cpu_ctxs = read_cpu_ctxs(skel)?;
1039
1040 let nr_layer_tasks: Vec<usize> = skel
1041 .maps
1042 .bss_data
1043 .as_ref()
1044 .unwrap()
1045 .layers
1046 .iter()
1047 .take(self.nr_layers)
1048 .map(|layer| layer.nr_tasks as usize)
1049 .collect();
1050 let layer_slice_us: Vec<u64> = skel
1051 .maps
1052 .bss_data
1053 .as_ref()
1054 .unwrap()
1055 .layers
1056 .iter()
1057 .take(self.nr_layers)
1058 .map(|layer| layer.slice_ns / 1000_u64)
1059 .collect();
1060
1061 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1062 let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1063
1064 let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1066 cur_agg
1067 .iter()
1068 .zip(prev_agg.iter())
1069 .map(|(cur, prev)| {
1070 cur.iter()
1071 .zip(prev.iter())
1072 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1073 .collect()
1074 })
1075 .collect()
1076 };
1077
1078 let compute_mem_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1081 cur_agg
1082 .iter()
1083 .zip(prev_agg.iter())
1084 .map(|(cur, prev)| {
1085 cur.iter()
1086 .zip(prev.iter())
1087 .map(|(c, p)| (c - p) as f64 / elapsed_f64)
1088 .collect()
1089 })
1090 .collect()
1091 };
1092
1093 let cur_layer_utils: Vec<Vec<f64>> =
1094 compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1095 let cur_layer_membw: Vec<Vec<f64>> =
1096 compute_mem_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1097
1098 let metric_decay =
1099 |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>, decay_rate: f64| {
1100 cur_metric
1101 .iter()
1102 .zip(prev_metric.iter())
1103 .map(|(cur, prev)| {
1104 cur.iter()
1105 .zip(prev.iter())
1106 .map(|(c, p)| {
1107 let decay = decay_rate.powf(elapsed_f64);
1108 p * decay + c * (1.0 - decay)
1109 })
1110 .collect()
1111 })
1112 .collect()
1113 };
1114
1115 let layer_utils: Vec<Vec<f64>> =
1116 metric_decay(cur_layer_utils, &self.layer_utils, *USAGE_DECAY);
1117 let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws, 0.0);
1118
1119 let cur_total_cpu = read_total_cpu(proc_reader)?;
1120 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1121
1122 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1123 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1124
1125 let processing_dur = cur_processing_dur
1126 .checked_sub(self.prev_processing_dur)
1127 .unwrap();
1128
1129 *self = Self {
1130 at: now,
1131 elapsed,
1132 nr_layers: self.nr_layers,
1133 nr_layer_tasks,
1134 nr_nodes: self.nr_nodes,
1135
1136 total_util: layer_utils
1137 .iter()
1138 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1139 .sum(),
1140 layer_utils,
1141 layer_membws,
1142 prev_layer_usages: cur_layer_usages,
1143 prev_layer_membw_agg: cur_layer_membw_agg,
1144
1145 cpu_busy,
1146 prev_total_cpu: cur_total_cpu,
1147
1148 bpf_stats,
1149 prev_bpf_stats: cur_bpf_stats,
1150
1151 processing_dur,
1152 prev_processing_dur: cur_processing_dur,
1153
1154 layer_slice_us,
1155 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1156 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1157 };
1158 Ok(())
1159 }
1160}
1161
1162#[derive(Debug)]
1163struct Layer {
1164 name: String,
1165 kind: LayerKind,
1166 growth_algo: LayerGrowthAlgo,
1167 core_order: Vec<usize>,
1168
1169 target_llc_cpus: (usize, usize),
1170 assigned_llcs: Vec<usize>,
1171
1172 nr_cpus: usize,
1173 nr_llc_cpus: Vec<usize>,
1174 cpus: Cpumask,
1175 allowed_cpus: Cpumask,
1176}
1177
1178fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1179 fs::read_to_string("/proc/kallsyms")?
1180 .lines()
1181 .find(|line| line.contains(sym_name))
1182 .and_then(|line| line.split_whitespace().next())
1183 .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1184 .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1185}
1186
1187fn resolve_cpus_pct_range(
1188 cpus_range: &Option<(usize, usize)>,
1189 cpus_range_frac: &Option<(f64, f64)>,
1190 max_cpus: usize,
1191) -> Result<(usize, usize)> {
1192 match (cpus_range, cpus_range_frac) {
1193 (Some(_x), Some(_y)) => {
1194 bail!("cpus_range cannot be used with cpus_pct.");
1195 }
1196 (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1197 (None, Some((cpus_frac_min, cpus_frac_max))) => {
1198 if *cpus_frac_min < 0_f64
1199 || *cpus_frac_min > 1_f64
1200 || *cpus_frac_max < 0_f64
1201 || *cpus_frac_max > 1_f64
1202 {
1203 bail!("cpus_range_frac values must be between 0.0 and 1.0");
1204 }
1205 let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1206 let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1207 Ok((
1208 std::cmp::max(cpus_min_count, 1),
1209 std::cmp::min(cpus_max_count, max_cpus),
1210 ))
1211 }
1212 (None, None) => Ok((0, max_cpus)),
1213 }
1214}
1215
1216impl Layer {
1217 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1218 let name = &spec.name;
1219 let kind = spec.kind.clone();
1220 let mut allowed_cpus = Cpumask::new();
1221 match &kind {
1222 LayerKind::Confined {
1223 cpus_range,
1224 cpus_range_frac,
1225 common: LayerCommon { nodes, llcs, .. },
1226 ..
1227 } => {
1228 let cpus_range =
1229 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1230 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1231 bail!("invalid cpus_range {:?}", cpus_range);
1232 }
1233 if nodes.is_empty() && llcs.is_empty() {
1234 allowed_cpus.set_all();
1235 } else {
1236 for (node_id, node) in &topo.nodes {
1238 if nodes.contains(node_id) {
1240 for &id in node.all_cpus.keys() {
1241 allowed_cpus.set_cpu(id)?;
1242 }
1243 }
1244 for (llc_id, llc) in &node.llcs {
1246 if llcs.contains(llc_id) {
1247 for &id in llc.all_cpus.keys() {
1248 allowed_cpus.set_cpu(id)?;
1249 }
1250 }
1251 }
1252 }
1253 }
1254 }
1255 LayerKind::Grouped {
1256 common: LayerCommon { nodes, llcs, .. },
1257 ..
1258 }
1259 | LayerKind::Open {
1260 common: LayerCommon { nodes, llcs, .. },
1261 ..
1262 } => {
1263 if nodes.is_empty() && llcs.is_empty() {
1264 allowed_cpus.set_all();
1265 } else {
1266 for (node_id, node) in &topo.nodes {
1268 if nodes.contains(node_id) {
1270 for &id in node.all_cpus.keys() {
1271 allowed_cpus.set_cpu(id)?;
1272 }
1273 }
1274 for (llc_id, llc) in &node.llcs {
1276 if llcs.contains(llc_id) {
1277 for &id in llc.all_cpus.keys() {
1278 allowed_cpus.set_cpu(id)?;
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285 }
1286
1287 if let Some(util_range) = kind.util_range() {
1290 if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1291 bail!("invalid util_range {:?}", util_range);
1292 }
1293 }
1294
1295 let layer_growth_algo = kind.common().growth_algo.clone();
1296
1297 debug!(
1298 "layer: {} algo: {:?} core order: {:?}",
1299 name, &layer_growth_algo, core_order
1300 );
1301
1302 Ok(Self {
1303 name: name.into(),
1304 kind,
1305 growth_algo: layer_growth_algo,
1306 core_order: core_order.clone(),
1307
1308 target_llc_cpus: (0, 0),
1309 assigned_llcs: vec![],
1310
1311 nr_cpus: 0,
1312 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1313 cpus: Cpumask::new(),
1314 allowed_cpus,
1315 })
1316 }
1317
1318 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1319 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1320 Some(ret) => ret.clone(),
1321 None => return Ok(0),
1322 };
1323
1324 let nr_to_free = cpus_to_free.weight();
1325
1326 Ok(if nr_to_free <= max_to_free {
1327 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1328 self.cpus &= &cpus_to_free.not();
1329 self.nr_cpus -= nr_to_free;
1330 for cpu in cpus_to_free.iter() {
1331 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1332 }
1333 cpu_pool.free(&cpus_to_free)?;
1334 nr_to_free
1335 } else {
1336 0
1337 })
1338 }
1339
1340 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1341 let new_cpus = match cpu_pool
1342 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1343 .clone()
1344 {
1345 Some(ret) => ret.clone(),
1346 None => {
1347 trace!("layer-{} can't grow, no CPUs", &self.name);
1348 return Ok(0);
1349 }
1350 };
1351
1352 let nr_new_cpus = new_cpus.weight();
1353
1354 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1355 self.cpus |= &new_cpus;
1356 self.nr_cpus += nr_new_cpus;
1357 for cpu in new_cpus.iter() {
1358 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1359 }
1360 Ok(nr_new_cpus)
1361 }
1362}
1363#[derive(Debug, Clone)]
1364struct NodeInfo {
1365 node_mask: nix::sched::CpuSet,
1366 _node_id: usize,
1367}
1368
1369#[derive(Debug)]
1370struct GpuTaskAffinitizer {
1371 gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1374 gpu_pids_to_devs: HashMap<Pid, u32>,
1375 last_process_time: Option<Instant>,
1376 sys: System,
1377 pid_map: HashMap<Pid, Vec<Pid>>,
1378 poll_interval: Duration,
1379 enable: bool,
1380 tasks_affinitized: u64,
1381 last_task_affinitization_ms: u64,
1382}
1383
1384impl GpuTaskAffinitizer {
1385 pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1386 GpuTaskAffinitizer {
1387 gpu_devs_to_node_info: HashMap::new(),
1388 gpu_pids_to_devs: HashMap::new(),
1389 last_process_time: None,
1390 sys: System::default(),
1391 pid_map: HashMap::new(),
1392 poll_interval: Duration::from_secs(poll_interval),
1393 enable,
1394 tasks_affinitized: 0,
1395 last_task_affinitization_ms: 0,
1396 }
1397 }
1398
1399 fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1400 for (chunk, &mask) in affinity.iter().enumerate() {
1401 let mut inner_offset: u64 = 1;
1402 for _ in 0..64 {
1403 if (mask & inner_offset) != 0 {
1404 return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1405 }
1406 inner_offset = inner_offset << 1;
1407 }
1408 }
1409 anyhow::bail!("unable to get CPU from NVML bitmask");
1410 }
1411
1412 fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1413 let mut cpuset = CpuSet::new();
1414 for (cpu_id, _cpu) in &node.all_cpus {
1415 cpuset.set(*cpu_id)?;
1416 }
1417 Ok(cpuset)
1418 }
1419
1420 fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1421 let nvml = nvml()?;
1422 let device_count = nvml.device_count()?;
1423
1424 for idx in 0..device_count {
1425 let dev = nvml.device_by_index(idx)?;
1426 let cpu = dev.cpu_affinity(16)?;
1428 let ideal_cpu = self.find_one_cpu(cpu)?;
1429 if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1430 self.gpu_devs_to_node_info.insert(
1431 idx,
1432 NodeInfo {
1433 node_mask: self.node_to_cpuset(
1434 topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1435 )?,
1436 _node_id: cpu.node_id,
1437 },
1438 );
1439 }
1440 }
1441 Ok(())
1442 }
1443
1444 fn update_gpu_pids(&mut self) -> Result<()> {
1445 let nvml = nvml()?;
1446 for i in 0..nvml.device_count()? {
1447 let device = nvml.device_by_index(i)?;
1448 for proc in device
1449 .running_compute_processes()?
1450 .into_iter()
1451 .chain(device.running_graphics_processes()?.into_iter())
1452 {
1453 self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1454 }
1455 }
1456 Ok(())
1457 }
1458
1459 fn update_process_info(&mut self) -> Result<()> {
1460 self.sys.refresh_processes_specifics(
1461 ProcessesToUpdate::All,
1462 true,
1463 ProcessRefreshKind::nothing(),
1464 );
1465 self.pid_map.clear();
1466 for (pid, proc_) in self.sys.processes() {
1467 if let Some(ppid) = proc_.parent() {
1468 self.pid_map.entry(ppid).or_default().push(*pid);
1469 }
1470 }
1471 Ok(())
1472 }
1473
1474 fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1475 let mut work = VecDeque::from([root_pid]);
1476 let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1477
1478 while let Some(pid) = work.pop_front() {
1479 if pids_and_tids.insert(pid) {
1480 if let Some(kids) = self.pid_map.get(&pid) {
1481 work.extend(kids);
1482 }
1483 if let Some(proc_) = self.sys.process(pid) {
1484 if let Some(tasks) = proc_.tasks() {
1485 pids_and_tids.extend(tasks.iter().copied());
1486 }
1487 }
1488 }
1489 }
1490 pids_and_tids
1491 }
1492
1493 fn affinitize_gpu_pids(&mut self) -> Result<()> {
1494 if !self.enable {
1495 return Ok(());
1496 }
1497 for (pid, dev) in &self.gpu_pids_to_devs {
1498 let node_info = self
1499 .gpu_devs_to_node_info
1500 .get(&dev)
1501 .expect("Unable to get gpu pid node mask");
1502 for child in self.get_child_pids_and_tids(*pid) {
1503 match nix::sched::sched_setaffinity(
1504 nix::unistd::Pid::from_raw(child.as_u32() as i32),
1505 &node_info.node_mask,
1506 ) {
1507 Ok(_) => {
1508 self.tasks_affinitized += 1;
1510 }
1511 Err(_) => {
1512 debug!(
1513 "Error affinitizing gpu pid {} to node {:#?}",
1514 child.as_u32(),
1515 node_info
1516 );
1517 }
1518 };
1519 }
1520 }
1521 Ok(())
1522 }
1523
1524 pub fn maybe_affinitize(&mut self) {
1525 if !self.enable {
1526 return;
1527 }
1528 let now = Instant::now();
1529
1530 if let Some(last_process_time) = self.last_process_time {
1531 if (now - last_process_time) < self.poll_interval {
1532 return;
1533 }
1534 }
1535
1536 match self.update_gpu_pids() {
1537 Ok(_) => {}
1538 Err(e) => {
1539 error!("Error updating GPU PIDs: {}", e);
1540 }
1541 };
1542 match self.update_process_info() {
1543 Ok(_) => {}
1544 Err(e) => {
1545 error!("Error updating process info to affinitize GPU PIDs: {}", e);
1546 }
1547 };
1548 match self.affinitize_gpu_pids() {
1549 Ok(_) => {}
1550 Err(e) => {
1551 error!("Error updating GPU PIDs: {}", e);
1552 }
1553 };
1554 self.last_process_time = Some(now);
1555 self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1556
1557 return;
1558 }
1559
1560 pub fn init(&mut self, topo: Arc<Topology>) {
1561 if !self.enable || self.last_process_time.is_some() {
1562 return;
1563 }
1564
1565 match self.init_dev_node_map(topo) {
1566 Ok(_) => {}
1567 Err(e) => {
1568 error!("Error initializing gpu node dev map: {}", e);
1569 }
1570 };
1571 self.sys = System::new_all();
1572 return;
1573 }
1574}
1575
1576struct Scheduler<'a> {
1577 skel: BpfSkel<'a>,
1578 struct_ops: Option<libbpf_rs::Link>,
1579 layer_specs: Vec<LayerSpec>,
1580
1581 sched_intv: Duration,
1582 layer_refresh_intv: Duration,
1583
1584 cpu_pool: CpuPool,
1585 layers: Vec<Layer>,
1586 idle_qos_enabled: bool,
1587
1588 proc_reader: fb_procfs::ProcReader,
1589 sched_stats: Stats,
1590
1591 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1592 processing_dur: Duration,
1593
1594 topo: Arc<Topology>,
1595 netdevs: BTreeMap<String, NetDev>,
1596 stats_server: StatsServer<StatsReq, StatsRes>,
1597 gpu_task_handler: GpuTaskAffinitizer,
1598}
1599
1600impl<'a> Scheduler<'a> {
1601 fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1602 skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1603 let mut perf_set = false;
1604
1605 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1606 let mut layer_weights: Vec<usize> = vec![];
1607
1608 for (spec_i, spec) in specs.iter().enumerate() {
1609 let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1610
1611 for (or_i, or) in spec.matches.iter().enumerate() {
1612 for (and_i, and) in or.iter().enumerate() {
1613 let mt = &mut layer.matches[or_i].matches[and_i];
1614
1615 mt.exclude.write(false);
1617
1618 match and {
1619 LayerMatch::CgroupPrefix(prefix) => {
1620 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1621 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1622 }
1623 LayerMatch::CgroupSuffix(suffix) => {
1624 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1625 copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1626 }
1627 LayerMatch::CgroupRegex(_) => {
1628 panic!("CgroupRegex match only supported in template");
1629 }
1630 LayerMatch::CgroupContains(substr) => {
1631 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1632 copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1633 }
1634 LayerMatch::CommPrefix(prefix) => {
1635 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1636 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1637 }
1638 LayerMatch::CommPrefixExclude(prefix) => {
1639 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1640 mt.exclude.write(true);
1641 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1642 }
1643 LayerMatch::PcommPrefix(prefix) => {
1644 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1645 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1646 }
1647 LayerMatch::PcommPrefixExclude(prefix) => {
1648 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1649 mt.exclude.write(true);
1650 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1651 }
1652 LayerMatch::NiceAbove(nice) => {
1653 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1654 mt.nice = *nice;
1655 }
1656 LayerMatch::NiceBelow(nice) => {
1657 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1658 mt.nice = *nice;
1659 }
1660 LayerMatch::NiceEquals(nice) => {
1661 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1662 mt.nice = *nice;
1663 }
1664 LayerMatch::UIDEquals(user_id) => {
1665 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1666 mt.user_id = *user_id;
1667 }
1668 LayerMatch::GIDEquals(group_id) => {
1669 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1670 mt.group_id = *group_id;
1671 }
1672 LayerMatch::PIDEquals(pid) => {
1673 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1674 mt.pid = *pid;
1675 }
1676 LayerMatch::PPIDEquals(ppid) => {
1677 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1678 mt.ppid = *ppid;
1679 }
1680 LayerMatch::TGIDEquals(tgid) => {
1681 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1682 mt.tgid = *tgid;
1683 }
1684 LayerMatch::NSPIDEquals(nsid, pid) => {
1685 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1686 mt.nsid = *nsid;
1687 mt.pid = *pid;
1688 }
1689 LayerMatch::NSEquals(nsid) => {
1690 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1691 mt.nsid = *nsid as u64;
1692 }
1693 LayerMatch::CmdJoin(joincmd) => {
1694 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1695 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1696 }
1697 LayerMatch::IsGroupLeader(polarity) => {
1698 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1699 mt.is_group_leader.write(*polarity);
1700 }
1701 LayerMatch::IsKthread(polarity) => {
1702 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1703 mt.is_kthread.write(*polarity);
1704 }
1705 LayerMatch::UsedGpuTid(polarity) => {
1706 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1707 mt.used_gpu_tid.write(*polarity);
1708 }
1709 LayerMatch::UsedGpuPid(polarity) => {
1710 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1711 mt.used_gpu_pid.write(*polarity);
1712 }
1713 LayerMatch::AvgRuntime(min, max) => {
1714 mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1715 mt.min_avg_runtime_us = *min;
1716 mt.max_avg_runtime_us = *max;
1717 }
1718 LayerMatch::HintEquals(hint) => {
1719 mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1720 mt.hint = *hint;
1721 }
1722 }
1723 }
1724 layer.matches[or_i].nr_match_ands = or.len() as i32;
1725 }
1726
1727 layer.nr_match_ors = spec.matches.len() as u32;
1728 layer.kind = spec.kind.as_bpf_enum();
1729
1730 {
1731 let LayerCommon {
1732 min_exec_us,
1733 yield_ignore,
1734 perf,
1735 preempt,
1736 preempt_first,
1737 exclusive,
1738 allow_node_aligned,
1739 skip_remote_node,
1740 prev_over_idle_core,
1741 growth_algo,
1742 nodes,
1743 slice_us,
1744 fifo,
1745 weight,
1746 disallow_open_after_us,
1747 disallow_preempt_after_us,
1748 xllc_mig_min_us,
1749 placement,
1750 ..
1751 } = spec.kind.common();
1752
1753 layer.slice_ns = *slice_us * 1000;
1754 layer.fifo.write(*fifo);
1755 layer.min_exec_ns = min_exec_us * 1000;
1756 layer.yield_step_ns = if *yield_ignore > 0.999 {
1757 0
1758 } else if *yield_ignore < 0.001 {
1759 layer.slice_ns
1760 } else {
1761 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1762 };
1763 let mut layer_name: String = spec.name.clone();
1764 layer_name.truncate(MAX_LAYER_NAME);
1765 copy_into_cstr(&mut layer.name, layer_name.as_str());
1766 layer.preempt.write(*preempt);
1767 layer.preempt_first.write(*preempt_first);
1768 layer.excl.write(*exclusive);
1769 layer.allow_node_aligned.write(*allow_node_aligned);
1770 layer.skip_remote_node.write(*skip_remote_node);
1771 layer.prev_over_idle_core.write(*prev_over_idle_core);
1772 layer.growth_algo = growth_algo.as_bpf_enum();
1773 layer.weight = *weight;
1774 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1775 v if v == u64::MAX => v,
1776 v => v * 1000,
1777 };
1778 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1779 v if v == u64::MAX => v,
1780 v => v * 1000,
1781 };
1782 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1783 layer_weights.push(layer.weight.try_into().unwrap());
1784 layer.perf = u32::try_from(*perf)?;
1785 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1786 for (topo_node_id, topo_node) in &topo.nodes {
1787 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1788 continue;
1789 }
1790 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1791 }
1792
1793 let task_place = |place: u32| crate::types::layer_task_place(place);
1794 layer.task_place = match placement {
1795 LayerPlacement::Standard => {
1796 task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1797 }
1798 LayerPlacement::Sticky => {
1799 task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1800 }
1801 LayerPlacement::Floating => {
1802 task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1803 }
1804 };
1805 }
1806
1807 layer.is_protected.write(match spec.kind {
1808 LayerKind::Open { .. } => false,
1809 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1810 protected
1811 }
1812 });
1813
1814 match &spec.cpuset {
1815 Some(mask) => {
1816 Self::update_cpumask(&mask, &mut layer.cpuset);
1817 }
1818 None => {
1819 for i in 0..layer.cpuset.len() {
1820 layer.cpuset[i] = u8::MAX;
1821 }
1822 }
1823 };
1824
1825 perf_set |= layer.perf > 0;
1826 }
1827
1828 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1829 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1830 skel.maps
1831 .rodata_data
1832 .as_mut()
1833 .unwrap()
1834 .layer_iteration_order[idx] = *layer_idx as u32;
1835 }
1836
1837 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1838 warn!("cpufreq support not available, ignoring perf configurations");
1839 }
1840
1841 Ok(())
1842 }
1843
1844 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1845 skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
1846 skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
1847
1848 for (&node_id, node) in &topo.nodes {
1849 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1850 skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
1851 let raw_numa_slice = node.span.as_raw_slice();
1852 let node_cpumask_slice =
1853 &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
1854 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1855 left.clone_from_slice(raw_numa_slice);
1856 debug!(
1857 "node {} mask: {:?}",
1858 node_id,
1859 skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
1860 );
1861
1862 for llc in node.llcs.values() {
1863 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1864 skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
1865 }
1866 }
1867
1868 for cpu in topo.all_cpus.values() {
1869 skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1870 }
1871 }
1872
1873 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1874 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1875 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1876 vec
1877 };
1878 let radiate_cpu =
1879 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1880 vec.sort_by_key(|&id| {
1881 (
1882 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1883 (center_cpu as i32 - id as i32).abs(),
1884 )
1885 });
1886 vec
1887 };
1888
1889 for (&cpu_id, cpu) in &topo.all_cpus {
1890 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1892 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1893 let node_span = &topo.nodes[&cpu.node_id].span;
1894 let sys_span = &topo.span;
1895
1896 let sys_span = sys_span.and(&node_span.not());
1898 let node_span = node_span.and(&llc_span.not());
1899 let llc_span = llc_span.and(&core_span.not());
1900 core_span.clear_cpu(cpu_id).unwrap();
1901
1902 let mut sys_order: Vec<usize> = sys_span.iter().collect();
1904 let mut node_order: Vec<usize> = node_span.iter().collect();
1905 let mut llc_order: Vec<usize> = llc_span.iter().collect();
1906 let mut core_order: Vec<usize> = core_span.iter().collect();
1907
1908 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1913 node_order = radiate(node_order, cpu.node_id);
1914 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1915 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1916
1917 let mut order: Vec<usize> = vec![];
1919 let mut idx: usize = 0;
1920
1921 idx += 1;
1922 order.push(cpu_id);
1923
1924 idx += core_order.len();
1925 order.append(&mut core_order);
1926 let core_end = idx;
1927
1928 idx += llc_order.len();
1929 order.append(&mut llc_order);
1930 let llc_end = idx;
1931
1932 idx += node_order.len();
1933 order.append(&mut node_order);
1934 let node_end = idx;
1935
1936 idx += sys_order.len();
1937 order.append(&mut sys_order);
1938 let sys_end = idx;
1939
1940 debug!(
1941 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1942 cpu_id, core_end, llc_end, node_end, sys_end, &order
1943 );
1944
1945 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1947 for (i, &cpu) in order.iter().enumerate() {
1948 pmap.cpus[i] = cpu as u16;
1949 }
1950 pmap.core_end = core_end as u32;
1951 pmap.llc_end = llc_end as u32;
1952 pmap.node_end = node_end as u32;
1953 pmap.sys_end = sys_end as u32;
1954 }
1955 }
1956
1957 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1958 cpu_ctxs
1959 .into_iter()
1960 .map(|cpu_ctx| {
1961 let bytes = unsafe {
1962 std::slice::from_raw_parts(
1963 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1964 std::mem::size_of::<bpf_intf::cpu_ctx>(),
1965 )
1966 };
1967 bytes.to_vec()
1968 })
1969 .collect()
1970 }
1971
1972 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1973 let key = (0_u32).to_ne_bytes();
1974 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1975 let cpu_ctxs_vec = skel
1976 .maps
1977 .cpu_ctxs
1978 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1979 .context("Failed to lookup cpu_ctx")?
1980 .unwrap();
1981
1982 let op_layers: Vec<u32> = layer_specs
1983 .iter()
1984 .enumerate()
1985 .filter(|(_idx, spec)| match &spec.kind {
1986 LayerKind::Open { .. } => spec.kind.common().preempt,
1987 _ => false,
1988 })
1989 .map(|(idx, _)| idx as u32)
1990 .collect();
1991 let on_layers: Vec<u32> = layer_specs
1992 .iter()
1993 .enumerate()
1994 .filter(|(_idx, spec)| match &spec.kind {
1995 LayerKind::Open { .. } => !spec.kind.common().preempt,
1996 _ => false,
1997 })
1998 .map(|(idx, _)| idx as u32)
1999 .collect();
2000 let gp_layers: Vec<u32> = layer_specs
2001 .iter()
2002 .enumerate()
2003 .filter(|(_idx, spec)| match &spec.kind {
2004 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2005 _ => false,
2006 })
2007 .map(|(idx, _)| idx as u32)
2008 .collect();
2009 let gn_layers: Vec<u32> = layer_specs
2010 .iter()
2011 .enumerate()
2012 .filter(|(_idx, spec)| match &spec.kind {
2013 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2014 _ => false,
2015 })
2016 .map(|(idx, _)| idx as u32)
2017 .collect();
2018
2019 for cpu in 0..*NR_CPUS_POSSIBLE {
2021 cpu_ctxs.push(*unsafe {
2022 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2023 });
2024
2025 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2026 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2027 cpu_ctxs[cpu].cpu = cpu as i32;
2028 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2029 cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
2030 cpu_ctxs[cpu].is_big = is_big;
2031
2032 fastrand::seed(cpu as u64);
2033
2034 let mut ogp_order = op_layers.clone();
2035 ogp_order.append(&mut gp_layers.clone());
2036 fastrand::shuffle(&mut ogp_order);
2037
2038 let mut ogn_order = on_layers.clone();
2039 ogn_order.append(&mut gn_layers.clone());
2040 fastrand::shuffle(&mut ogn_order);
2041
2042 let mut op_order = op_layers.clone();
2043 fastrand::shuffle(&mut op_order);
2044
2045 let mut on_order = on_layers.clone();
2046 fastrand::shuffle(&mut on_order);
2047
2048 let mut gp_order = gp_layers.clone();
2049 fastrand::shuffle(&mut gp_order);
2050
2051 let mut gn_order = gn_layers.clone();
2052 fastrand::shuffle(&mut gn_order);
2053
2054 for i in 0..MAX_LAYERS {
2055 cpu_ctxs[cpu].ogp_layer_order[i] =
2056 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2057 cpu_ctxs[cpu].ogn_layer_order[i] =
2058 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2059
2060 cpu_ctxs[cpu].op_layer_order[i] =
2061 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2062 cpu_ctxs[cpu].on_layer_order[i] =
2063 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2064 cpu_ctxs[cpu].gp_layer_order[i] =
2065 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2066 cpu_ctxs[cpu].gn_layer_order[i] =
2067 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2068 }
2069 }
2070
2071 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2072
2073 skel.maps
2074 .cpu_ctxs
2075 .update_percpu(
2076 &key,
2077 &Self::convert_cpu_ctxs(cpu_ctxs),
2078 libbpf_rs::MapFlags::ANY,
2079 )
2080 .context("Failed to update cpu_ctx")?;
2081
2082 Ok(())
2083 }
2084
2085 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2086 for (&llc_id, llc) in &topo.all_llcs {
2087 let mut node_order: Vec<usize> =
2089 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2090 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2091
2092 sys_order.retain(|id| !node_order.contains(id));
2094 node_order.retain(|&id| id != llc_id);
2095
2096 fastrand::seed(llc_id as u64);
2099 fastrand::shuffle(&mut sys_order);
2100 fastrand::shuffle(&mut node_order);
2101
2102 let mut order: Vec<usize> = vec![];
2104 let mut idx: usize = 0;
2105
2106 idx += 1;
2107 order.push(llc_id);
2108
2109 idx += node_order.len();
2110 order.append(&mut node_order);
2111 let node_end = idx;
2112
2113 idx += sys_order.len();
2114 order.append(&mut sys_order);
2115 let sys_end = idx;
2116
2117 debug!(
2118 "LLC[{}] proximity map[{}/{}]: {:?}",
2119 llc_id, node_end, sys_end, &order
2120 );
2121
2122 let key = llc_id as u32;
2127 let llc_id_slice =
2128 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2129 let v = skel
2130 .maps
2131 .llc_data
2132 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2133 .unwrap()
2134 .unwrap();
2135 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2136
2137 let pmap = &mut llcc.prox_map;
2138 for (i, &llc_id) in order.iter().enumerate() {
2139 pmap.llcs[i] = llc_id as u16;
2140 }
2141 pmap.node_end = node_end as u32;
2142 pmap.sys_end = sys_end as u32;
2143
2144 let v = unsafe {
2145 std::slice::from_raw_parts(
2146 &llcc as *const bpf_intf::llc_ctx as *const u8,
2147 std::mem::size_of::<bpf_intf::llc_ctx>(),
2148 )
2149 };
2150
2151 skel.maps
2152 .llc_data
2153 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2154 }
2155
2156 Ok(())
2157 }
2158
2159 fn init(
2160 opts: &'a Opts,
2161 layer_specs: &[LayerSpec],
2162 open_object: &'a mut MaybeUninit<OpenObject>,
2163 hint_to_layer_map: &HashMap<u64, usize>,
2164 membw_tracking: bool,
2165 ) -> Result<Self> {
2166 let nr_layers = layer_specs.len();
2167 let mut disable_topology = opts.disable_topology.unwrap_or(false);
2168
2169 let topo = Arc::new(if disable_topology {
2170 Topology::with_flattened_llc_node()?
2171 } else {
2172 Topology::new()?
2173 });
2174
2175 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2182 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2183 }
2184 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2185 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2186 }
2187 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2188 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2189 }
2190
2191 let netdevs = if opts.netdev_irq_balance {
2192 warn!(
2193 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2194 );
2195 read_netdevs()?
2196 } else {
2197 BTreeMap::new()
2198 };
2199
2200 if !disable_topology {
2201 if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2202 disable_topology = true;
2203 };
2204 info!(
2205 "Topology awareness not specified, selecting {} based on hardware",
2206 if disable_topology {
2207 "disabled"
2208 } else {
2209 "enabled"
2210 }
2211 );
2212 };
2213
2214 let cpu_pool = CpuPool::new(topo.clone())?;
2215
2216 let layer_specs: Vec<_> = if disable_topology {
2219 info!("Disabling topology awareness");
2220 layer_specs
2221 .iter()
2222 .cloned()
2223 .map(|mut s| {
2224 s.kind.common_mut().nodes.clear();
2225 s.kind.common_mut().llcs.clear();
2226 s
2227 })
2228 .collect()
2229 } else {
2230 layer_specs.to_vec()
2231 };
2232
2233 init_libbpf_logging(None);
2235 let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2236 if !kfuncs_in_syscall {
2237 warn!("Using slow path: kfuncs not supported in syscall programs (a8e03b6bbb2c ∉ ker)");
2238 }
2239
2240 let mut skel_builder = BpfSkelBuilder::default();
2242 skel_builder.obj_builder.debug(opts.verbose > 1);
2243 info!(
2244 "Running scx_layered (build ID: {})",
2245 build_id::full_version(env!("CARGO_PKG_VERSION"))
2246 );
2247 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2248 let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2249
2250 skel.progs.scx_pmu_tc.set_autoload(membw_tracking);
2252
2253 if opts.enable_gpu_support {
2256 if opts.gpu_kprobe_level >= 1 {
2259 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2260 }
2261 if opts.gpu_kprobe_level >= 2 {
2264 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2265 }
2266 if opts.gpu_kprobe_level >= 3 {
2267 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2268 }
2269 }
2270
2271 let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2272 let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2273
2274 let event = if membw_tracking {
2275 setup_membw_tracking(&mut skel)?
2276 } else {
2277 0
2278 };
2279
2280 let rodata = skel.maps.rodata_data.as_mut().unwrap();
2281
2282 if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
2283 rodata.ext_sched_class_addr = ext_sched_class_addr.unwrap();
2284 rodata.idle_sched_class_addr = idle_sched_class_addr.unwrap();
2285 } else {
2286 warn!(
2287 "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2288 );
2289 }
2290
2291 rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2292 rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2293
2294 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2296
2297 if !opts.disable_queued_wakeup {
2298 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2299 0 => info!("Kernel does not support queued wakeup optimization"),
2300 v => skel.struct_ops.layered_mut().flags |= v,
2301 }
2302 }
2303
2304 rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2305 rodata.percpu_kthread_preempt_all =
2306 !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2307 rodata.debug = opts.verbose as u32;
2308 rodata.slice_ns = opts.slice_us * 1000;
2309 rodata.max_exec_ns = if opts.max_exec_us > 0 {
2310 opts.max_exec_us * 1000
2311 } else {
2312 opts.slice_us * 1000 * 20
2313 };
2314 rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2315 rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2316 rodata.smt_enabled = topo.smt_enabled;
2317 rodata.has_little_cores = topo.has_little_cores();
2318 rodata.xnuma_preemption = opts.xnuma_preemption;
2319 rodata.antistall_sec = opts.antistall_sec;
2320 rodata.monitor_disable = opts.monitor_disable;
2321 rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2322 rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2323 rodata.enable_antistall = !opts.disable_antistall;
2324 rodata.enable_match_debug = opts.enable_match_debug;
2325 rodata.enable_gpu_support = opts.enable_gpu_support;
2326 rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2327
2328 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2329 rodata.__sibling_cpu[cpu] = *sib;
2330 }
2331 for cpu in topo.all_cpus.keys() {
2332 rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2333 }
2334
2335 rodata.nr_op_layers = layer_specs
2336 .iter()
2337 .filter(|spec| match &spec.kind {
2338 LayerKind::Open { .. } => spec.kind.common().preempt,
2339 _ => false,
2340 })
2341 .count() as u32;
2342 rodata.nr_on_layers = layer_specs
2343 .iter()
2344 .filter(|spec| match &spec.kind {
2345 LayerKind::Open { .. } => !spec.kind.common().preempt,
2346 _ => false,
2347 })
2348 .count() as u32;
2349 rodata.nr_gp_layers = layer_specs
2350 .iter()
2351 .filter(|spec| match &spec.kind {
2352 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2353 _ => false,
2354 })
2355 .count() as u32;
2356 rodata.nr_gn_layers = layer_specs
2357 .iter()
2358 .filter(|spec| match &spec.kind {
2359 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2360 _ => false,
2361 })
2362 .count() as u32;
2363 rodata.nr_excl_layers = layer_specs
2364 .iter()
2365 .filter(|spec| spec.kind.common().exclusive)
2366 .count() as u32;
2367
2368 let mut min_open = u64::MAX;
2369 let mut min_preempt = u64::MAX;
2370
2371 for spec in layer_specs.iter() {
2372 if let LayerKind::Open { common, .. } = &spec.kind {
2373 min_open = min_open.min(common.disallow_open_after_us.unwrap());
2374 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2375 }
2376 }
2377
2378 rodata.min_open_layer_disallow_open_after_ns = match min_open {
2379 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2380 v => v,
2381 };
2382 rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2383 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2384 v => v,
2385 };
2386
2387 for i in 0..layer_specs.len() {
2389 skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2390 }
2391 skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2392
2393 let layered_task_hint_map_path = &opts.task_hint_map;
2398 let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2399 if !layered_task_hint_map_path.is_empty() {
2401 hint_map.set_pin_path(layered_task_hint_map_path).context("Failed to set pin path for task hint map")?;
2402 rodata.task_hint_map_enabled = true;
2403 }
2404
2405 if !opts.hi_fb_thread_name.is_empty() {
2406 let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2407 copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2408 rodata.enable_hi_fb_thread_name_match = true;
2409 }
2410
2411 Self::init_layers(&mut skel, &layer_specs, &topo)?;
2412 Self::init_nodes(&mut skel, opts, &topo);
2413
2414 let mut skel = scx_ops_load!(skel, layered, uei)?;
2415
2416 if !hint_to_layer_map.is_empty() {
2418 for (k, v) in hint_to_layer_map.iter() {
2419 let key: u32 = *k as u32;
2420 let value: u32 = *v as u32;
2421 skel.maps.hint_to_layer_id_map.update(
2422 &key.to_ne_bytes(),
2423 &value.to_ne_bytes(),
2424 libbpf_rs::MapFlags::ANY,
2425 )?;
2426 }
2427 }
2428
2429 if membw_tracking {
2430 create_perf_fds(&mut skel, event)?;
2431 }
2432
2433 let mut layers = vec![];
2434 let layer_growth_orders =
2435 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2436 for (idx, spec) in layer_specs.iter().enumerate() {
2437 let growth_order = layer_growth_orders
2438 .get(&idx)
2439 .context("layer has no growth order")?;
2440 layers.push(Layer::new(spec, &topo, growth_order)?);
2441 }
2442
2443 let mut idle_qos_enabled = layers
2444 .iter()
2445 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2446 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2447 warn!("idle_resume_us not supported, ignoring");
2448 idle_qos_enabled = false;
2449 }
2450
2451 Self::init_cpus(&skel, &layer_specs, &topo)?;
2452 Self::init_llc_prox_map(&mut skel, &topo)?;
2453
2454 let proc_reader = fb_procfs::ProcReader::new();
2456
2457 let input = ProgramInput {
2459 ..Default::default()
2460 };
2461 let prog = &mut skel.progs.initialize_pid_namespace;
2462
2463 let _ = prog.test_run(input);
2464
2465 if !layered_task_hint_map_path.is_empty() {
2474 let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2475 let mode: libc::mode_t = 0o666;
2476 unsafe {
2477 if libc::chmod(path.as_ptr(), mode) != 0 {
2478 trace!("'chmod' to 666 of task hint map failed, continuing...");
2479 }
2480 }
2481 }
2482
2483 let struct_ops = scx_ops_attach!(skel, layered)?;
2485 let stats_server = StatsServer::new(stats::server_data()).launch()?;
2486 let mut gpu_task_handler =
2487 GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2488 gpu_task_handler.init(topo.clone());
2489 let sched = Self {
2490 struct_ops: Some(struct_ops),
2491 layer_specs,
2492
2493 sched_intv: Duration::from_secs_f64(opts.interval),
2494 layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2495
2496 cpu_pool,
2497 layers,
2498 idle_qos_enabled,
2499
2500 sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2501
2502 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2503 processing_dur: Default::default(),
2504
2505 proc_reader,
2506 skel,
2507
2508 topo,
2509 netdevs,
2510 stats_server,
2511 gpu_task_handler,
2512 };
2513
2514 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2515
2516 Ok(sched)
2517 }
2518
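// Mirror a Cpumask into the byte-array bitmap shared with BPF: bit (cpu % 8)
// of byte (cpu / 8) is set iff the CPU is present in the mask.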
2519 fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2520 for cpu in 0..mask.len() {
2521 if mask.test_cpu(cpu) {
2522 bpfmask[cpu / 8] |= 1 << (cpu % 8);
2523 } else {
2524 bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2525 }
2526 }
2527 }
2528
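// Propagate a layer's current CPU assignment (bitmap, total count and per-LLC
// counts) to its BPF counterpart and flag it so the BPF side refreshes its cpumask.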
2529 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2530 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2531 Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2532
2533 bpf_layer.nr_cpus = layer.nr_cpus as u32;
2534 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2535 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2536 }
2537
2538 bpf_layer.refresh_cpus = 1;
2539 }
2540
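// Point each netdev's IRQ affinities at the CPUs the scheduler currently has
// available on the device's node; if none of the available CPUs belong to that
// node, fall back to all of the node's CPUs.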
2541 fn update_netdev_cpumasks(&mut self) -> Result<()> {
2542 let available_cpus = self.cpu_pool.available_cpus();
2543 if available_cpus.is_empty() {
2544 return Ok(());
2545 }
2546
2547 for (iface, netdev) in self.netdevs.iter_mut() {
2548 let node = self
2549 .topo
2550 .nodes
2551 .values()
2552 .find(|n| n.id == netdev.node())
2554 .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2555 let node_cpus = node.span.clone();
2556 for (irq, irqmask) in netdev.irqs.iter_mut() {
2557 irqmask.clear_all();
2558 for cpu in available_cpus.iter() {
2559 if !node_cpus.test_cpu(cpu) {
2560 continue;
2561 }
2562 let _ = irqmask.set_cpu(cpu);
2563 }
2564 if irqmask.weight() == 0 {
2566 for cpu in node_cpus.iter() {
2567 let _ = irqmask.set_cpu(cpu);
2568 }
2569 }
2570 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2571 }
2572 netdev.apply_cpumasks()?;
2573 }
2574
2575 Ok(())
2576 }
2577
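// Clamp a layer's upper CPU target so that projected memory bandwidth (last
// observed per-CPU bandwidth times the CPU count) stays under membw_limit.
// The result lies within [low, high]; if even `low` CPUs would exceed the
// limit, warn and return `low`.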
2578 fn clamp_target_by_membw(
2579 &self,
2580 layer: &Layer,
2581 membw_limit: usize,
2582 last_membw: usize,
2583 low: usize,
2584 high: usize,
2585 ) -> usize {
2586 let ncpu = layer.cpus.weight();
2587 let last_membw_percpu = if ncpu > 0 { last_membw / ncpu } else { 0 };
2588
2589 if last_membw_percpu * high < membw_limit {
2591 return high;
2592 }
2593
2594 if last_membw_percpu * low > membw_limit {
2597 warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2598 return low;
2599 }
2600
2601 membw_limit / last_membw_percpu
2603 }
2604
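// Compute per-layer (target, minimum) CPU counts. Confined/Grouped layers
// derive their target from recent utilization and util_range, optionally
// clamped by the memory-bandwidth budget and cpus_range. Open layers return
// (0, 0) as they run on whatever CPUs are left over.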
2605 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2611 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2612 let utils = &self.sched_stats.layer_utils;
2613
2614 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2615 let mut targets: Vec<(usize, usize)> = vec![];
2616
2617 for (idx, layer) in self.layers.iter().enumerate() {
2618 targets.push(match &layer.kind {
2619 LayerKind::Confined {
2620 util_range,
2621 cpus_range,
2622 cpus_range_frac,
2623 membw_gb,
2624 ..
2625 }
2626 | LayerKind::Grouped {
2627 util_range,
2628 cpus_range,
2629 cpus_range_frac,
2630 membw_gb,
2631 ..
2632 } => {
2633 let owned = utils[idx][LAYER_USAGE_OWNED];
2638 let open = utils[idx][LAYER_USAGE_OPEN];
2639
2640 let mut util = owned;
2641 if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2642 util += open;
2643 }
2644
2645 let util = if util < 0.01 { 0.0 } else { util };
2646 let low = (util / util_range.1).ceil() as usize;
2647 let mut high = ((util / util_range.0).floor() as usize).max(low);
2648
2649 if let Some(membw_limit) = membw_gb {
2650 let membw_cpus = &self.sched_stats.layer_membws[idx];
2651 let last_membw = membw_cpus.into_iter().map(|x: &f64| *x as usize).sum();
2652 trace!(
2653 "(last_membw, membw_limit): ({last_membw} bytes, {membw_limit} GiB)"
2654 );
2655
2656 high = self.clamp_target_by_membw(
2657 &layer,
2658 (*membw_limit * (1024_u64.pow(3) as f64)) as usize,
2659 last_membw,
2660 low,
2661 high,
2662 );
2663 }
2664
2665 let target = layer.cpus.weight().clamp(low, high);
2666 let cpus_range =
2667 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2668
2669 records.push((
2670 (owned * 100.0) as u64,
2671 (open * 100.0) as u64,
2672 (util * 100.0) as u64,
2673 low,
2674 high,
2675 target,
2676 ));
2677
2678 (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2679 }
2680 LayerKind::Open { .. } => (0, 0),
2681 });
2682 }
2683
2684 trace!("initial targets: {:?}", &targets);
2685 trace!("(owned, open, util, low, high, target): {:?}", &records);
2686 targets
2687 }
2688
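// Distribute all CPUs across layers by weight: first satisfy layers whose
// target is at or below their minimum, then repeatedly accept layers whose
// target fits within their weighted share of the remaining CPUs, and finally
// split whatever is left among the remaining layers proportionally.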
2689 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2693 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2694 let weights: Vec<usize> = self
2695 .layers
2696 .iter()
2697 .map(|layer| layer.kind.common().weight as usize)
2698 .collect();
2699 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2700 .iter()
2701 .zip(&weights)
2702 .enumerate()
2703 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2704 .collect();
2705 let mut weight_sum: usize = weights.iter().sum();
2706 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2707
2708 trace!("cands: {:?}", &cands);
2709
2710 cands.retain(|&i, &mut (target, min, weight)| {
2712 if target <= min {
2713 let target = target.min(nr_left);
2714 weighted[i] = target;
2715 weight_sum -= weight;
2716 nr_left -= target;
2717 false
2718 } else {
2719 true
2720 }
2721 });
2722
2723 trace!("cands after accepting mins: {:?}", &cands);
2724
2725 let calc_share = |nr_left, weight, weight_sum| {
2727 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2728 };
2729
2730 while !cands.is_empty() {
2731 let mut progress = false;
2732
2733 cands.retain(|&i, &mut (target, _min, weight)| {
2734 let share = calc_share(nr_left, weight, weight_sum);
2735 if target <= share {
2736 weighted[i] = target;
2737 weight_sum -= weight;
2738 nr_left -= target;
2739 progress = true;
2740 false
2741 } else {
2742 true
2743 }
2744 });
2745
2746 if !progress {
2747 break;
2748 }
2749 }
2750
2751 trace!("cands after accepting under allotted: {:?}", &cands);
2752
2753 let nr_to_share = nr_left;
2756 for (i, (_target, _min, weight)) in cands.into_iter() {
2757 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2758 weighted[i] = share;
2759 nr_left -= share;
2760 }
2761
2762 trace!("weighted: {:?}", &weighted);
2763
2764 weighted
2765 }
2766
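// Translate a CPU target into (nr_full_llcs, nr_extra_cores). For example,
// assuming 16 CPUs per LLC and 2 CPUs per core, a target of 40 CPUs maps to
// 2 full LLCs plus ceil(8 / 2) = 4 extra cores.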
2767 fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2771 let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2773 let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2775 let cpus_per_llc = cores_per_llc * cpus_per_core;
2776
2777 let full = target / cpus_per_llc;
2778 let extra = target % cpus_per_llc;
2779
2780 (full, extra.div_ceil(cpus_per_core))
2781 }
2782
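// Rebalance whole LLCs among StickyDynamic layers based on their new CPU
// targets: shrinking layers return LLCs to the free pool, growing layers take
// LLCs from it, partially used free LLCs cover the per-layer "extra" cores,
// and each affected layer's core_order is rebuilt from its assigned LLCs.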
2783 fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
2791 debug!(
2793 " free: before pass: free_llcs={:?}",
2794 self.cpu_pool.free_llcs
2795 );
2796 for &(idx, target) in layer_targets.iter().rev() {
2797 let layer = &mut self.layers[idx];
2798 let old_tlc = layer.target_llc_cpus;
2799 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2800
2801 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2802 continue;
2803 }
2804
2805 let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
2806
2807 debug!(
2808 " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2809 layer.name,
2810 old_tlc,
2811 new_tlc,
2812 to_free,
2813 layer.assigned_llcs.len(),
2814 self.cpu_pool.free_llcs.len()
2815 );
2816
2817 while to_free > 0 && !layer.assigned_llcs.is_empty() {
2818 let llc = layer.assigned_llcs.pop().unwrap();
2819 self.cpu_pool.free_llcs.push((llc, 0));
2820 to_free -= 1;
2821
2822 debug!(" layer={} freed_llc={}", layer.name, llc);
2823 }
2824 }
2825 debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2826
2827 for &(idx, target) in layer_targets.iter().rev() {
2829 let layer = &mut self.layers[idx];
2830 let old_tlc = layer.target_llc_cpus;
2831 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2832
2833 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2834 continue;
2835 }
2836
2837 let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
2838
2839 debug!(
2840 " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2841 layer.name,
2842 old_tlc,
2843 new_tlc,
2844 to_alloc,
2845 layer.assigned_llcs.len(),
2846 self.cpu_pool.free_llcs.len()
2847 );
2848
2849 while to_alloc > 0
2850 && !self.cpu_pool.free_llcs.is_empty()
2851 && to_alloc <= self.cpu_pool.free_llcs.len()
2852 {
2853 let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2854 layer.assigned_llcs.push(llc);
2855 to_alloc -= 1;
2856
2857 debug!(" layer={} alloc_llc={}", layer.name, llc);
2858 }
2859
2860 debug!(
2861 " alloc: layer={} assigned_llcs={:?}",
2862 layer.name, layer.assigned_llcs
2863 );
2864
2865 layer.target_llc_cpus = new_tlc;
2867 }
2868
2869 for &(idx, _) in layer_targets.iter() {
2872 let mut core_order = vec![];
2873 let layer = &mut self.layers[idx];
2874
2875 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2876 continue;
2877 }
2878
2879 let tlc = layer.target_llc_cpus;
2880 let mut extra = tlc.1;
2881 let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2883 let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2884 let cpus_per_llc = cores_per_llc * cpus_per_core;
2885
2886 for i in 0..self.cpu_pool.free_llcs.len() {
2888 let free_vec = &mut self.cpu_pool.free_llcs;
2889 let avail = cpus_per_llc - free_vec[i].1;
2891 let used = extra.min(avail);
2893
2894 let shift = free_vec[i].1;
2895 free_vec[i].1 += used;
2896
2897 let llc_id = free_vec[i].0;
2898 let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2899
2900 for core in llc.cores.iter().skip(shift).take(used) {
2901 core_order.push(core.1.id);
2906 }
2907
2908 extra -= used;
2909 if extra == 0 {
2910 break;
2911 }
2912 }
2913
2914 core_order.reverse();
2915 layer.core_order = core_order;
2916 }
2917
2918 for i in 0..self.cpu_pool.free_llcs.len() {
2920 self.cpu_pool.free_llcs[i].1 = 0;
2921 }
2922
2923 for &(idx, _) in layer_targets.iter() {
2924 let layer = &mut self.layers[idx];
2925
2926 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2927 continue;
2928 }
2929
2930 for core in self.topo.all_cores.iter() {
2931 let llc_id = core.1.llc_id;
2932 if layer.assigned_llcs.contains(&llc_id) {
2933 layer.core_order.push(core.1.id);
2934 }
2935 }
2936 layer.core_order.reverse();
2938
2939 debug!(
2940 " alloc: layer={} core_order={:?}",
2941 layer.name, layer.core_order
2942 );
2943 }
2944 }
2945
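// Periodic CPU reallocation: compute weighted targets, shrink confined and
// grouped layers that are over target, grow the ones under target, then hand
// every open layer the CPUs that remain available. BPF cpumasks, the fallback
// CPU and the empty-layer list are refreshed whenever anything changed.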
2946 fn refresh_cpumasks(&mut self) -> Result<()> {
2947 let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
2948
2949 let mut updated = false;
2950 let targets = self.calc_target_nr_cpus();
2951 let targets = self.weighted_target_nr_cpus(&targets);
2952
2953 let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
2954 ascending.sort_by(|a, b| a.1.cmp(&b.1));
2955
2956 self.recompute_layer_core_order(&ascending);
2957
2958 let mut force_free = self
2961 .layers
2962 .iter()
2963 .zip(targets.iter())
2964 .any(|(layer, &target)| layer.nr_cpus < target);
2965
2966 for &(idx, target) in ascending.iter().rev() {
2970 let layer = &mut self.layers[idx];
2971 if layer_is_open(layer) {
2972 continue;
2973 }
2974
2975 let nr_cur = layer.cpus.weight();
2976 if nr_cur <= target {
2977 continue;
2978 }
2979 let mut nr_to_free = nr_cur - target;
2980
2981 let nr_to_break_at = nr_to_free / 2;
2986
2987 let mut freed = false;
2988
2989 while nr_to_free > 0 {
2990 let max_to_free = if force_free {
2991 force_free = false;
2992 layer.nr_cpus
2993 } else {
2994 nr_to_free
2995 };
2996
2997 let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
2998 if nr_freed == 0 {
2999 break;
3000 }
3001
3002 nr_to_free = nr_to_free.saturating_sub(nr_freed);
3003 freed = true;
3004
3005 if nr_to_free <= nr_to_break_at {
3006 break;
3007 }
3008 }
3009
3010 if freed {
3011 Self::update_bpf_layer_cpumask(
3012 layer,
3013 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3014 );
3015 updated = true;
3016 }
3017 }
3018
3019 for &(idx, target) in &ascending {
3025 let layer = &mut self.layers[idx];
3026
3027 if layer_is_open(layer) {
3028 continue;
3029 }
3030
3031 let nr_cur = layer.cpus.weight();
3032 if nr_cur >= target {
3033 continue;
3034 }
3035
3036 let mut nr_to_alloc = target - nr_cur;
3037 let mut alloced = false;
3038
3039 while nr_to_alloc > 0 {
3040 let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3041 if nr_alloced == 0 {
3042 break;
3043 }
3044 alloced = true;
3045 nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3046 }
3047
3048 if alloced {
3049 Self::update_bpf_layer_cpumask(
3050 layer,
3051 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3052 );
3053 updated = true;
3054 }
3055 }
3056
3057 if updated {
3059 for (idx, layer) in self.layers.iter_mut().enumerate() {
3060 if !layer_is_open(layer) {
3061 continue;
3062 }
3063
3064 let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3065 let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3066 let nr_available_cpus = available_cpus.weight();
3067
3068 layer.cpus = available_cpus;
3071 layer.nr_cpus = nr_available_cpus;
3072 Self::update_bpf_layer_cpumask(layer, bpf_layer);
3073 }
3074
3075 self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3076 self.cpu_pool.fallback_cpu as u32;
3077
3078 for (lidx, layer) in self.layers.iter().enumerate() {
3079 self.nr_layer_cpus_ranges[lidx] = (
3080 self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3081 self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3082 );
3083 }
3084
3085 let input = ProgramInput {
3087 ..Default::default()
3088 };
3089 let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3090 let _ = prog.test_run(input);
3091
3092 let empty_layer_ids: Vec<u32> = self
3094 .layers
3095 .iter()
3096 .enumerate()
3097 .filter(|(_idx, layer)| layer.nr_cpus == 0)
3098 .map(|(idx, _layer)| idx as u32)
3099 .collect();
3100 for i in 0..self.layers.len() {
3101 self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3102 empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3103 }
3104 self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3105 empty_layer_ids.len() as u32;
3106 }
3107
3108 let _ = self.update_netdev_cpumasks();
3109 Ok(())
3110 }
3111
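// Apply each layer's idle_resume_us to the CPUs it currently owns as an idle
// resume latency limit; CPUs not covered by such a layer are reset to 0.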
3112 fn refresh_idle_qos(&mut self) -> Result<()> {
3113 if !self.idle_qos_enabled {
3114 return Ok(());
3115 }
3116
3117 let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3118 for layer in self.layers.iter() {
3119 let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3120 for cpu in layer.cpus.iter() {
3121 cpu_idle_qos[cpu] = idle_resume_us;
3122 }
3123 }
3124
3125 for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3126 update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3127 }
3128
3129 Ok(())
3130 }
3131
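// One scheduling tick: refresh stats, reallocate CPUs, reapply idle QoS and
// let the GPU task affinitizer run if it is due.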
3132 fn step(&mut self) -> Result<()> {
3133 let started_at = Instant::now();
3134 self.sched_stats.refresh(
3135 &mut self.skel,
3136 &self.proc_reader,
3137 started_at,
3138 self.processing_dur,
3139 &self.gpu_task_handler,
3140 )?;
3141 self.refresh_cpumasks()?;
3142 self.refresh_idle_qos()?;
3143 self.gpu_task_handler.maybe_affinitize();
3144 self.processing_dur += Instant::now().duration_since(started_at);
3145 Ok(())
3146 }
3147
3148 fn generate_sys_stats(
3149 &mut self,
3150 stats: &Stats,
3151 cpus_ranges: &mut [(usize, usize)],
3152 ) -> Result<SysStats> {
3153 let bstats = &stats.bpf_stats;
3154 let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3155
3156 for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3157 let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3158 sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3159 cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3160 }
3161
3162 Ok(sys_stats)
3163 }
3164
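// Main loop: step the scheduler every sched_intv, bump the BPF avgruntime
// refresh sequence every layer_refresh_intv, and service stats requests in
// between, until shutdown is requested or the BPF scheduler exits.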
3165 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3166 let (res_ch, req_ch) = self.stats_server.channels();
3167 let mut next_sched_at = Instant::now() + self.sched_intv;
3168 let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3169 let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3170 let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3171
3172 while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3173 let now = Instant::now();
3174
3175 if now >= next_sched_at {
3176 self.step()?;
3177 while next_sched_at < now {
3178 next_sched_at += self.sched_intv;
3179 }
3180 }
3181
3182 if enable_layer_refresh && now >= next_layer_refresh_at {
3183 self.skel
3184 .maps
3185 .bss_data
3186 .as_mut()
3187 .unwrap()
3188 .layer_refresh_seq_avgruntime += 1;
3189 while next_layer_refresh_at < now {
3190 next_layer_refresh_at += self.layer_refresh_intv;
3191 }
3192 }
3193
3194 match req_ch.recv_deadline(next_sched_at) {
3195 Ok(StatsReq::Hello(tid)) => {
3196 cpus_ranges.insert(
3197 tid,
3198 self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3199 );
3200 let stats =
3201 Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3202 res_ch.send(StatsRes::Hello(stats))?;
3203 }
3204 Ok(StatsReq::Refresh(tid, mut stats)) => {
3205 for i in 0..self.nr_layer_cpus_ranges.len() {
3207 for (_, ranges) in cpus_ranges.iter_mut() {
3208 ranges[i] = (
3209 ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3210 ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3211 );
3212 }
3213 self.nr_layer_cpus_ranges[i] =
3214 (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3215 }
3216
3217 stats.refresh(
3218 &mut self.skel,
3219 &self.proc_reader,
3220 now,
3221 self.processing_dur,
3222 &self.gpu_task_handler,
3223 )?;
3224 let sys_stats =
3225 self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3226 res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3227 }
3228 Ok(StatsReq::Bye(tid)) => {
3229 cpus_ranges.remove(&tid);
3230 res_ch.send(StatsRes::Bye)?;
3231 }
3232 Err(RecvTimeoutError::Timeout) => {}
3233 Err(e) => Err(e)?,
3234 }
3235 }
3236
3237 let _ = self.struct_ops.take();
3238 uei_report!(&self.skel, uei)
3239 }
3240}
3241
3242impl Drop for Scheduler<'_> {
3243 fn drop(&mut self) {
3244 info!("Unregister {SCHEDULER_NAME} scheduler");
3245
3246 if let Some(struct_ops) = self.struct_ops.take() {
3247 drop(struct_ops);
3248 }
3249 }
3250}
3251
3252fn write_example_file(path: &str) -> Result<()> {
3253 let mut f = fs::OpenOptions::new()
3254 .create_new(true)
3255 .write(true)
3256 .open(path)?;
3257 Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3258}
3259
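// Validate layer specs (match block counts, string lengths, hint values,
// cpus/util ranges) and build the hint-value -> layer-index map used to seed
// the BPF hint_to_layer_id_map.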
3260fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, usize>> {
3261 let mut hint_to_layer_map = HashMap::<u64, (usize, String)>::new();
3262
3263 let nr_specs = specs.len();
3264 if nr_specs == 0 {
3265 bail!("No layer spec");
3266 }
3267 if nr_specs > MAX_LAYERS {
3268 bail!("Too many layer specs");
3269 }
3270
3271 for (idx, spec) in specs.iter().enumerate() {
3272 if idx < nr_specs - 1 {
3273 if spec.matches.is_empty() {
3274 bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3275 }
3276 } else {
3277 if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3278 bail!("Terminal spec {:?} must have an empty match", spec.name);
3279 }
3280 }
3281
3282 if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3283 bail!(
3284 "Spec {:?} has too many ({}) OR match blocks",
3285 spec.name,
3286 spec.matches.len()
3287 );
3288 }
3289
3290 for (ands_idx, ands) in spec.matches.iter().enumerate() {
3291 if ands.len() > NR_LAYER_MATCH_KINDS {
3292 bail!(
3293 "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3294 spec.name,
3295 ands_idx,
3296 ands.len()
3297 );
3298 }
3299 let mut hint_equals_cnt = 0;
3300 for one in ands.iter() {
3301 match one {
3302 LayerMatch::CgroupPrefix(prefix) => {
3303 if prefix.len() > MAX_PATH {
3304 bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3305 }
3306 }
3307 LayerMatch::CgroupSuffix(suffix) => {
3308 if suffix.len() > MAX_PATH {
3309 bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3310 }
3311 }
3312 LayerMatch::CgroupContains(substr) => {
3313 if substr.len() > MAX_PATH {
3314 bail!("Spec {:?} has too long a cgroup substr", spec.name);
3315 }
3316 }
3317 LayerMatch::CommPrefix(prefix) => {
3318 if prefix.len() > MAX_COMM {
3319 bail!("Spec {:?} has too long a comm prefix", spec.name);
3320 }
3321 }
3322 LayerMatch::PcommPrefix(prefix) => {
3323 if prefix.len() > MAX_COMM {
3324 bail!("Spec {:?} has too long a process name prefix", spec.name);
3325 }
3326 }
3327 LayerMatch::HintEquals(hint) => {
3328 if *hint > 1024 {
3329 bail!(
3330 "Spec {:?} has hint value outside the range [0, 1024]",
3331 spec.name
3332 );
3333 }
3334
3335 if let Some((layer_id, name)) = hint_to_layer_map.get(hint) {
3336 if *layer_id != idx {
3337 bail!(
3338 "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
3339 spec.name,
3340 hint,
3341 name
3342 );
3343 }
3344 } else {
3345 hint_to_layer_map.insert(*hint, (idx, spec.name.clone()));
3346 }
3347 hint_equals_cnt += 1;
3348 }
3349 _ => {}
3350 }
3351 }
3352 if hint_equals_cnt > 1 {
3353 bail!("Only 1 HintEquals match permitted per AND block");
3354 }
3355 if hint_equals_cnt == 1 && ands.len() != 1 {
3356 bail!("HintEquals match cannot be in conjunction with other matches");
3357 }
3358 }
3359
3360 match spec.kind {
3361 LayerKind::Confined {
3362 cpus_range,
3363 util_range,
3364 ..
3365 }
3366 | LayerKind::Grouped {
3367 cpus_range,
3368 util_range,
3369 ..
3370 } => {
3371 if let Some((cpus_min, cpus_max)) = cpus_range {
3372 if cpus_min > cpus_max {
3373 bail!(
3374 "Spec {:?} has invalid cpus_range({}, {})",
3375 spec.name,
3376 cpus_min,
3377 cpus_max
3378 );
3379 }
3380 }
3381 if util_range.0 >= util_range.1 {
3382 bail!(
3383 "Spec {:?} has invalid util_range ({}, {})",
3384 spec.name,
3385 util_range.0,
3386 util_range.1
3387 );
3388 }
3389 }
3390 _ => {}
3391 }
3392 }
3393
3394 Ok(hint_to_layer_map
3395 .into_iter()
3396 .map(|(k, v)| (k, v.0))
3397 .collect())
3398}
3399
3400fn name_suffix(cgroup: &str, len: usize) -> String {
3401 let suffixlen = std::cmp::min(len, cgroup.len());
3402 let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
3403
3404 suffixrev.chars().rev().collect()
3405}
3406
3407fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
3408 let mut paths = vec![];
3409
3410 if !dir.is_dir() {
3411 panic!("path {:?} does not correspond to directory", dir);
3412 }
3413
3414 let direntries = fs::read_dir(dir)?;
3415
3416 for entry in direntries {
3417 let path = entry?.path();
3418 if path.is_dir() {
3419 paths.append(&mut traverse_sysfs(&path)?);
3420 paths.push(path);
3421 }
3422 }
3423
3424 Ok(paths)
3425}
3426
3427fn find_cpumask(cgroup: &str) -> Cpumask {
3428 let mut path = String::from(cgroup);
3429 path.push_str("/cpuset.cpus.effective");
3430
3431 let description = fs::read_to_string(&path).unwrap();
3432
3433 Cpumask::from_cpulist(&description).unwrap()
3434}
3435
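// Expand a template match (CgroupSuffix or CgroupRegex) into one concrete
// CgroupSuffix match per matching cgroup, paired with that cgroup's effective
// cpuset so the generated layer can be confined to it.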
3436fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
3437 match rule {
3438 LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3439 .into_iter()
3440 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3441 .filter(|cgroup| cgroup.ends_with(suffix))
3442 .map(|cgroup| {
3443 (
3444 {
3445 let mut slashterminated = cgroup.clone();
3446 slashterminated.push('/');
3447 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3448 },
3449 find_cpumask(&cgroup),
3450 )
3451 })
3452 .collect()),
3453 LayerMatch::CgroupRegex(expr) => {
let re = Regex::new(expr).context("invalid cgroup regex")?;
Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3454 .into_iter()
3455 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3456 .filter(|cgroup| re.is_match(cgroup))
3460 .map(|cgroup| {
3461 (
3462 {
3466 let mut slashterminated = cgroup.clone();
3467 slashterminated.push('/');
3468 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3469 },
3470 find_cpumask(&cgroup),
3471 )
3472 })
3473 .collect())
}
3474 _ => bail!("Unimplemented template match {:?}", rule),
3475 }
3476}
3477
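// Open a raw perf counter for the memory-bandwidth event on every possible
// CPU and store the resulting fds in scx_pmu_map, keyed by CPU.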
3478fn create_perf_fds(skel: &mut BpfSkel, event: u64) -> Result<()> {
3479 let mut attr = perf::bindings::perf_event_attr::default();
3480 attr.size = std::mem::size_of::<perf::bindings::perf_event_attr>() as u32;
3481 attr.type_ = perf::bindings::PERF_TYPE_RAW;
3482 attr.config = event;
3483 attr.sample_type = 0u64;
3484 attr.set_disabled(0);
3485
3486 let perf_events_map = &skel.maps.scx_pmu_map;
3487 let map_fd = unsafe { libbpf_sys::bpf_map__fd(perf_events_map.as_libbpf_object().as_ptr()) };
3488
3489 let mut failures = 0u64;
3490
3491 for cpu in 0..*NR_CPUS_POSSIBLE {
3492 let fd = unsafe { perf::perf_event_open(&mut attr as *mut _, -1, cpu as i32, -1, 0) };
3493 if fd < 0 {
3494 failures += 1;
3495 trace!(
3496 "perf_event_open failed cpu={cpu} errno={}",
3497 std::io::Error::last_os_error()
3498 );
3499 continue;
3500 }
3501
3502 let key = cpu as u32;
3503 let val = fd as u32;
3504 let ret = unsafe {
3505 libbpf_sys::bpf_map_update_elem(
3506 map_fd,
3507 &key as *const _ as *const _,
3508 &val as *const _ as *const _,
3509 0,
3510 )
3511 };
3512 if ret != 0 {
3513 trace!("bpf_map_update_elem failed cpu={cpu} fd={fd} ret={ret}");
3514 } else {
3515 trace!("mapped cpu={cpu} -> fd={fd}");
3516 }
3517 }
3518
3519 if failures > 0 {
3520 trace!("failed to install {failures} counters");
3521 }
3523
3524 Ok(())
3525}
3526
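// Pick a per-vendor PMU event to use as the memory-bandwidth signal (DRAM
// fill events on AMD Zen, L3-miss loads on supported Intel parts), record its
// raw config in rodata and return it for perf_event_open.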
3527fn setup_membw_tracking(skel: &mut OpenBpfSkel) -> Result<u64> {
3529 let pmumanager = PMUManager::new(None)?;
3530 let codename = &pmumanager.codename as &str;
3531
3532 let pmuspec = match codename {
3533 "amdzen1" | "amdzen2" | "amdzen3" => {
3534 trace!("found AMD codename {codename}");
3535 pmumanager.pmus.get("ls_any_fills_from_sys.mem_io_local")
3536 }
3537 "amdzen4" | "amdzen5" => {
3538 trace!("found AMD codename {codename}");
3539 pmumanager.pmus.get("ls_any_fills_from_sys.dram_io_all")
3540 }
3541
3542 "haswell" | "broadwell" | "broadwellde" | "broadwellx" | "skylake" | "skylakex"
3543 | "cascadelakex" | "arrowlake" | "meteorlake" | "sapphirerapids" | "emeraldrapids"
3544 | "graniterapids" => {
3545 trace!("found Intel codename {codename}");
3546 pmumanager.pmus.get("mem_load_retired.l3_miss")
3547 }
3548
3549 _ => {
3550 trace!("found unknown codename {codename}");
3551 None
3552 }
3553 };
3554
3555 let spec = pmuspec.ok_or_else(|| anyhow!("no memory bandwidth PMU event found for codename {codename}"))?;
3556 let config = (spec.umask << 8) | spec.event;
3557
3558 skel.maps.rodata_data.as_mut().unwrap().membw_event = config;
3560
3561 Ok(config)
3562}
3563
3564fn main() -> Result<()> {
3565 let opts = Opts::parse();
3566
3567 if opts.version {
3568 println!(
3569 "scx_layered {}",
3570 build_id::full_version(env!("CARGO_PKG_VERSION"))
3571 );
3572 return Ok(());
3573 }
3574
3575 if opts.help_stats {
3576 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
3577 return Ok(());
3578 }
3579
3580 if opts.no_load_frac_limit {
3581 warn!("--no-load-frac-limit is deprecated and a no-op");
3582 }
3583 if opts.layer_preempt_weight_disable != 0.0 {
3584 warn!("--layer-preempt-weight-disable is deprecated and a no-op");
3585 }
3586 if opts.layer_growth_weight_disable != 0.0 {
3587 warn!("--layer-growth-weight-disable is deprecated and a no-op");
3588 }
3589 if opts.local_llc_iteration {
3590 warn!("--local_llc_iteration is deprecated and a no-op");
3591 }
3592
3593 let llv = match opts.verbose {
3594 0 => simplelog::LevelFilter::Info,
3595 1 => simplelog::LevelFilter::Debug,
3596 _ => simplelog::LevelFilter::Trace,
3597 };
3598 let mut lcfg = simplelog::ConfigBuilder::new();
3599 lcfg.set_time_offset_to_local()
3600 .expect("Failed to set local time offset")
3601 .set_time_level(simplelog::LevelFilter::Error)
3602 .set_location_level(simplelog::LevelFilter::Off)
3603 .set_target_level(simplelog::LevelFilter::Off)
3604 .set_thread_level(simplelog::LevelFilter::Off);
3605 simplelog::TermLogger::init(
3606 llv,
3607 lcfg.build(),
3608 simplelog::TerminalMode::Stderr,
3609 simplelog::ColorChoice::Auto,
3610 )?;
3611
3612 debug!("opts={:?}", &opts);
3613
3614 let shutdown = Arc::new(AtomicBool::new(false));
3615 let shutdown_clone = shutdown.clone();
3616 ctrlc::set_handler(move || {
3617 shutdown_clone.store(true, Ordering::Relaxed);
3618 })
3619 .context("Error setting Ctrl-C handler")?;
3620
3621 if let Some(intv) = opts.monitor.or(opts.stats) {
3622 let shutdown_copy = shutdown.clone();
3623 let jh = std::thread::spawn(move || {
3624 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
3625 Ok(_) => {
3626 debug!("stats monitor thread finished successfully")
3627 }
3628 Err(error_object) => {
3629 warn!(
3630 "stats monitor thread finished with an error: {}",
3631 error_object
3632 )
3633 }
3634 }
3635 });
3636 if opts.monitor.is_some() {
3637 let _ = jh.join();
3638 return Ok(());
3639 }
3640 }
3641
3642 if let Some(path) = &opts.example {
3643 write_example_file(path)?;
3644 return Ok(());
3645 }
3646
3647 let mut layer_config = match opts.run_example {
3648 true => EXAMPLE_CONFIG.clone(),
3649 false => LayerConfig { specs: vec![] },
3650 };
3651
3652 for (idx, input) in opts.specs.iter().enumerate() {
3653 let specs = LayerSpec::parse(input)
3654 .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
3655
3656 for spec in specs {
3657 match spec.template {
3658 Some(ref rule) => {
3659 let matches = expand_template(&rule)?;
3660 if matches.is_empty() {
3663 layer_config.specs.push(spec);
3664 } else {
3665 for (mt, mask) in matches {
3666 let mut genspec = spec.clone();
3667
3668 genspec.cpuset = Some(mask);
3669
3670 for orterm in &mut genspec.matches {
3672 orterm.push(mt.clone());
3673 }
3674
3675 match &mt {
3676 LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
3677 _ => bail!("Template match has unexpected type"),
3678 }
3679
3680 layer_config.specs.push(genspec);
3682 }
3683 }
3684 }
3685
3686 None => {
3687 layer_config.specs.push(spec);
3688 }
3689 }
3690 }
3691 }
3692
3693 for spec in layer_config.specs.iter_mut() {
3694 let common = spec.kind.common_mut();
3695
3696 if common.slice_us == 0 {
3697 common.slice_us = opts.slice_us;
3698 }
3699
3700 if common.weight == 0 {
3701 common.weight = DEFAULT_LAYER_WEIGHT;
3702 }
3703 common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
3704
3705 if common.preempt {
3706 if common.disallow_open_after_us.is_some() {
3707 warn!(
3708 "Preempt layer {} has non-null disallow_open_after_us, ignored",
3709 &spec.name
3710 );
3711 }
3712 if common.disallow_preempt_after_us.is_some() {
3713 warn!(
3714 "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
3715 &spec.name
3716 );
3717 }
3718 common.disallow_open_after_us = Some(u64::MAX);
3719 common.disallow_preempt_after_us = Some(u64::MAX);
3720 } else {
3721 if common.disallow_open_after_us.is_none() {
3722 common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
3723 }
3724
3725 if common.disallow_preempt_after_us.is_none() {
3726 common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
3727 }
3728 }
3729
3730 if common.idle_smt.is_some() {
3731 warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
3732 }
3733 }
3734
3735 let membw_required = layer_config.specs.iter().any(|spec| match spec.kind {
3736 LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
3737 membw_gb.is_some()
3738 }
3739 LayerKind::Open { .. } => false,
3740 });
3741
3742 if opts.print_and_exit {
3743 println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3744 return Ok(());
3745 }
3746
3747 debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3748 let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
3749
3750 let mut open_object = MaybeUninit::uninit();
3751 loop {
3752 let mut sched = Scheduler::init(
3753 &opts,
3754 &layer_config.specs,
3755 &mut open_object,
3756 &hint_to_layer_map,
3757 membw_required,
3758 )?;
3759 if !sched.run(shutdown.clone())?.should_restart() {
3760 break;
3761 }
3762 }
3763
3764 Ok(())
3765}