mod bpf_skel;
mod stats;

use std::collections::BTreeMap;
use std::collections::HashMap;
use std::collections::HashSet;
use std::ffi::CString;
use std::fs;
use std::io::Write;
use std::mem::MaybeUninit;
use std::ops::Sub;
use std::path::Path;
use std::path::PathBuf;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::ThreadId;
use std::time::Duration;
use std::time::Instant;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
pub use bpf_skel::*;
use clap::Parser;
use crossbeam::channel::RecvTimeoutError;
use lazy_static::lazy_static;
use libbpf_rs::MapCore as _;
use libbpf_rs::OpenObject;
use libbpf_rs::ProgramInput;
use log::info;
use log::trace;
use log::warn;
use log::{debug, error};
use nix::sched::CpuSet;
use nvml_wrapper::error::NvmlError;
use nvml_wrapper::Nvml;
use once_cell::sync::OnceCell;
use regex::Regex;
use scx_bpf_compat;
use scx_layered::*;
use scx_stats::prelude::*;
use scx_utils::build_id;
use scx_utils::compat;
use scx_utils::init_libbpf_logging;
use scx_utils::libbpf_clap_opts::LibbpfOpts;
use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
use scx_utils::read_netdevs;
use scx_utils::scx_enums;
use scx_utils::scx_ops_attach;
use scx_utils::scx_ops_load;
use scx_utils::scx_ops_open;
use scx_utils::uei_exited;
use scx_utils::uei_report;
use scx_utils::CoreType;
use scx_utils::Cpumask;
use scx_utils::Llc;
use scx_utils::NetDev;
use scx_utils::Topology;
use scx_utils::UserExitInfo;
use scx_utils::NR_CPUS_POSSIBLE;
use scx_utils::NR_CPU_IDS;
use stats::LayerStats;
use stats::StatsReq;
use stats::StatsRes;
use stats::SysStats;
use std::collections::VecDeque;
use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};

const SCHEDULER_NAME: &str = "scx_layered";
const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;

const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
const LAYER_USAGE_PROTECTED_PREEMPT: usize =
    bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;

const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;

const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;

static NVML: OnceCell<Nvml> = OnceCell::new();

fn nvml() -> Result<&'static Nvml, NvmlError> {
    NVML.get_or_try_init(Nvml::init)
}

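// Decay factor and default timeouts derived from the BPF-side constants, plus
// a built-in example layer configuration.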
107lazy_static! {
108 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
109 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
110 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
111 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
112 specs: vec![
113 LayerSpec {
114 name: "batch".into(),
115 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
116 cpuset: None,
117 template: None,
118 matches: vec![
119 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
120 vec![LayerMatch::NiceAbove(0)],
121 ],
122 kind: LayerKind::Confined {
123 util_range: (0.8, 0.9),
124 cpus_range: Some((0, 16)),
125 cpus_range_frac: None,
126 protected: false,
127 membw_gb: None,
128 common: LayerCommon {
129 min_exec_us: 1000,
130 yield_ignore: 0.0,
131 preempt: false,
132 preempt_first: false,
133 exclusive: false,
134 allow_node_aligned: false,
135 skip_remote_node: false,
136 prev_over_idle_core: false,
137 idle_smt: None,
138 slice_us: 20000,
139 fifo: false,
140 weight: DEFAULT_LAYER_WEIGHT,
141 disallow_open_after_us: None,
142 disallow_preempt_after_us: None,
143 xllc_mig_min_us: 1000.0,
144 growth_algo: LayerGrowthAlgo::Sticky,
145 idle_resume_us: None,
146 perf: 1024,
147 nodes: vec![],
148 llcs: vec![],
149 placement: LayerPlacement::Standard,
150 },
151 },
152 },
153 LayerSpec {
154 name: "immediate".into(),
155 comment: Some("tasks under workload.slice with nice value < 0".into()),
156 cpuset: None,
157 template: None,
158 matches: vec![vec![
159 LayerMatch::CgroupPrefix("workload.slice/".into()),
160 LayerMatch::NiceBelow(0),
161 ]],
162 kind: LayerKind::Open {
163 common: LayerCommon {
164 min_exec_us: 100,
165 yield_ignore: 0.25,
166 preempt: true,
167 preempt_first: false,
168 exclusive: true,
169 allow_node_aligned: true,
170 skip_remote_node: false,
171 prev_over_idle_core: true,
172 idle_smt: None,
173 slice_us: 20000,
174 fifo: false,
175 weight: DEFAULT_LAYER_WEIGHT,
176 disallow_open_after_us: None,
177 disallow_preempt_after_us: None,
178 xllc_mig_min_us: 0.0,
179 growth_algo: LayerGrowthAlgo::Sticky,
180 perf: 1024,
181 idle_resume_us: None,
182 nodes: vec![],
183 llcs: vec![],
184 placement: LayerPlacement::Standard,
185 },
186 },
187 },
188 LayerSpec {
189 name: "stress-ng".into(),
190 comment: Some("stress-ng test layer".into()),
191 cpuset: None,
192 template: None,
193 matches: vec![
194 vec![LayerMatch::CommPrefix("stress-ng".into()),],
195 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
196 ],
197 kind: LayerKind::Confined {
198 cpus_range: None,
199 util_range: (0.2, 0.8),
200 protected: false,
201 cpus_range_frac: None,
202 membw_gb: None,
203 common: LayerCommon {
204 min_exec_us: 800,
205 yield_ignore: 0.0,
206 preempt: true,
207 preempt_first: false,
208 exclusive: false,
209 allow_node_aligned: false,
210 skip_remote_node: false,
211 prev_over_idle_core: false,
212 idle_smt: None,
213 slice_us: 800,
214 fifo: false,
215 weight: DEFAULT_LAYER_WEIGHT,
216 disallow_open_after_us: None,
217 disallow_preempt_after_us: None,
218 xllc_mig_min_us: 0.0,
219 growth_algo: LayerGrowthAlgo::Topo,
220 perf: 1024,
221 idle_resume_us: None,
222 nodes: vec![],
223 llcs: vec![],
224 placement: LayerPlacement::Standard,
225 },
226 },
227 },
228 LayerSpec {
229 name: "normal".into(),
230 comment: Some("the rest".into()),
231 cpuset: None,
232 template: None,
233 matches: vec![vec![]],
234 kind: LayerKind::Grouped {
235 cpus_range: None,
236 util_range: (0.5, 0.6),
237 util_includes_open_cputime: true,
238 protected: false,
239 cpus_range_frac: None,
240 membw_gb: None,
241 common: LayerCommon {
242 min_exec_us: 200,
243 yield_ignore: 0.0,
244 preempt: false,
245 preempt_first: false,
246 exclusive: false,
247 allow_node_aligned: false,
248 skip_remote_node: false,
249 prev_over_idle_core: false,
250 idle_smt: None,
251 slice_us: 20000,
252 fifo: false,
253 weight: DEFAULT_LAYER_WEIGHT,
254 disallow_open_after_us: None,
255 disallow_preempt_after_us: None,
256 xllc_mig_min_us: 100.0,
257 growth_algo: LayerGrowthAlgo::Linear,
258 perf: 1024,
259 idle_resume_us: None,
260 nodes: vec![],
261 llcs: vec![],
262 placement: LayerPlacement::Standard,
263 },
264 },
265 },
266 ],
267 };
268}
269
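/// Command-line options for scx_layered.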
270#[derive(Debug, Parser)]
550#[command(verbatim_doc_comment)]
551struct Opts {
552 #[clap(short = 's', long, default_value = "20000")]
554 slice_us: u64,
555
556 #[clap(short = 'M', long, default_value = "0")]
561 max_exec_us: u64,
562
563 #[clap(short = 'i', long, default_value = "0.1")]
565 interval: f64,
566
567 #[clap(short = 'n', long, default_value = "false")]
570 no_load_frac_limit: bool,
571
572 #[clap(long, default_value = "0")]
574 exit_dump_len: u32,
575
576 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
579 verbose: u8,
580
581 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
585 disable_topology: Option<bool>,
586
587 #[clap(long)]
589 xnuma_preemption: bool,
590
591 #[clap(long)]
593 monitor_disable: bool,
594
595 #[clap(short = 'e', long)]
597 example: Option<String>,
598
599 #[clap(long, default_value = "0.0")]
603 layer_preempt_weight_disable: f64,
604
605 #[clap(long, default_value = "0.0")]
609 layer_growth_weight_disable: f64,
610
611 #[clap(long)]
613 stats: Option<f64>,
614
615 #[clap(long)]
618 monitor: Option<f64>,
619
620 #[clap(long)]
622 run_example: bool,
623
624 #[clap(long, default_value = "false")]
627 local_llc_iteration: bool,
628
629 #[clap(long, default_value = "10000")]
634 lo_fb_wait_us: u64,
635
636 #[clap(long, default_value = ".05")]
639 lo_fb_share: f64,
640
641 #[clap(long, default_value = "false")]
643 disable_antistall: bool,
644
645 #[clap(long, default_value = "false")]
647 enable_gpu_affinitize: bool,
648
649 #[clap(long, default_value = "900")]
652 gpu_affinitize_secs: u64,
653
654 #[clap(long, default_value = "false")]
659 enable_match_debug: bool,
660
661 #[clap(long, default_value = "3")]
663 antistall_sec: u64,
664
665 #[clap(long, default_value = "false")]
667 enable_gpu_support: bool,
668
669 #[clap(long, default_value = "3")]
677 gpu_kprobe_level: u64,
678
679 #[clap(long, default_value = "false")]
681 netdev_irq_balance: bool,
682
683 #[clap(long, default_value = "false")]
685 disable_queued_wakeup: bool,
686
687 #[clap(long, default_value = "false")]
689 disable_percpu_kthread_preempt: bool,
690
691 #[clap(long, default_value = "false")]
695 percpu_kthread_preempt_all: bool,
696
697 #[clap(short = 'V', long, action = clap::ArgAction::SetTrue)]
699 version: bool,
700
701 #[clap(long)]
703 help_stats: bool,
704
705 specs: Vec<String>,
707
708 #[clap(long, default_value = "2000")]
711 layer_refresh_ms_avgruntime: u64,
712
713 #[clap(long, default_value = "")]
715 task_hint_map: String,
716
717 #[clap(long, default_value = "false")]
719 print_and_exit: bool,
720
721 #[clap(long, default_value = "")]
723 hi_fb_thread_name: String,
724
725 #[clap(flatten, next_help_heading = "Libbpf Options")]
726 pub libbpf: LibbpfOpts,
727}
728
fn read_total_cpu(reader: &fb_procfs::ProcReader) -> Result<fb_procfs::CpuStat> {
    reader
        .read_stat()
        .context("Failed to read procfs")?
        .total_cpu
        .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
}

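// Compute the busy fraction of total CPU time between two procfs snapshots,
// returning 1.0 when no time has elapsed between the samples.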
fn calc_util(curr: &fb_procfs::CpuStat, prev: &fb_procfs::CpuStat) -> Result<f64> {
    match (curr, prev) {
        (
            fb_procfs::CpuStat {
                user_usec: Some(curr_user),
                nice_usec: Some(curr_nice),
                system_usec: Some(curr_system),
                idle_usec: Some(curr_idle),
                iowait_usec: Some(curr_iowait),
                irq_usec: Some(curr_irq),
                softirq_usec: Some(curr_softirq),
                stolen_usec: Some(curr_stolen),
                ..
            },
            fb_procfs::CpuStat {
                user_usec: Some(prev_user),
                nice_usec: Some(prev_nice),
                system_usec: Some(prev_system),
                idle_usec: Some(prev_idle),
                iowait_usec: Some(prev_iowait),
                irq_usec: Some(prev_irq),
                softirq_usec: Some(prev_softirq),
                stolen_usec: Some(prev_stolen),
                ..
            },
        ) => {
            let idle_usec = curr_idle.saturating_sub(*prev_idle);
            let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
            let user_usec = curr_user.saturating_sub(*prev_user);
            let system_usec = curr_system.saturating_sub(*prev_system);
            let nice_usec = curr_nice.saturating_sub(*prev_nice);
            let irq_usec = curr_irq.saturating_sub(*prev_irq);
            let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
            let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);

            let busy_usec =
                user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
            let total_usec = idle_usec + busy_usec + iowait_usec;
            if total_usec > 0 {
                Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
            } else {
                Ok(1.0)
            }
        }
        _ => bail!("Missing stats in cpustat"),
    }
}

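// Copy a Rust string, including its NUL terminator, into a fixed-size i8
// buffer shared with BPF. Panics if the string contains an interior NUL or
// does not fit in the destination.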
fn copy_into_cstr(dst: &mut [i8], src: &str) {
    let cstr = CString::new(src).unwrap();
    let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
    dst[0..bytes.len()].copy_from_slice(bytes);
}

fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
    let mut mask = 0;
    for node in nodes {
        mask |= 1 << node;
    }
    mask
}

fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
    let mut mask = 0;
    for (_, cache) in llcs {
        mask |= 1 << cache.id;
    }
    mask
}

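// Read a snapshot of every possible CPU's cpu_ctx from the per-CPU BPF map.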
fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
    let mut cpu_ctxs = vec![];
    let cpu_ctxs_vec = skel
        .maps
        .cpu_ctxs
        .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
        .context("Failed to lookup cpu_ctx")?
        .unwrap();
    for cpu in 0..*NR_CPUS_POSSIBLE {
        cpu_ctxs.push(*unsafe {
            &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
        });
    }
    Ok(cpu_ctxs)
}

#[derive(Clone, Debug)]
struct BpfStats {
    gstats: Vec<u64>,
    lstats: Vec<Vec<u64>>,
    lstats_sums: Vec<u64>,
    llc_lstats: Vec<Vec<Vec<u64>>>,
}

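// Aggregated BPF-side counters: global and per-layer stats summed across all
// CPUs, plus per-layer stats broken out by LLC from the llc_data map.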
831impl BpfStats {
832 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
833 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
834 let nr_llcs = skel.maps.rodata_data.as_ref().unwrap().nr_llcs as usize;
835 let mut gstats = vec![0u64; NR_GSTATS];
836 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
837 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
838
839 for cpu in 0..*NR_CPUS_POSSIBLE {
840 for stat in 0..NR_GSTATS {
841 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
842 }
843 for layer in 0..nr_layers {
844 for stat in 0..NR_LSTATS {
845 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
846 }
847 }
848 }
849
850 let mut lstats_sums = vec![0u64; NR_LSTATS];
851 for layer in 0..nr_layers {
852 for stat in 0..NR_LSTATS {
853 lstats_sums[stat] += lstats[layer][stat];
854 }
855 }
856
857 for llc_id in 0..nr_llcs {
858 let key = llc_id as u32;
864 let llc_id_slice =
865 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
866 let v = skel
867 .maps
868 .llc_data
869 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
870 .unwrap()
871 .unwrap();
872 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
873
874 for layer_id in 0..nr_layers {
875 for stat_id in 0..NR_LLC_LSTATS {
876 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
877 }
878 }
879 }
880
881 Self {
882 gstats,
883 lstats,
884 lstats_sums,
885 llc_lstats,
886 }
887 }
888}
889
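// Element-wise difference of two BpfStats snapshots, used to turn cumulative
// counters into per-interval deltas.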
impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
    type Output = BpfStats;

    fn sub(self, rhs: &'b BpfStats) -> BpfStats {
        let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
        BpfStats {
            gstats: vec_sub(&self.gstats, &rhs.gstats),
            lstats: self
                .lstats
                .iter()
                .zip(rhs.lstats.iter())
                .map(|(l, r)| vec_sub(l, r))
                .collect(),
            lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
            llc_lstats: self
                .llc_lstats
                .iter()
                .zip(rhs.llc_lstats.iter())
                .map(|(l_layer, r_layer)| {
                    l_layer
                        .iter()
                        .zip(r_layer.iter())
                        .map(|(l_llc, r_llc)| {
                            let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
                            // LAT is not a cumulative counter; zero the RHS so the
                            // subtraction keeps the latest value as-is.
                            r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
                            vec_sub(&l_llc, &r_llc)
                        })
                        .collect()
                })
                .collect(),
        }
    }
}
924
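// Userspace-side scheduling statistics, recomputed every stats interval with
// exponential decay applied to per-layer utilization and memory bandwidth.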
#[derive(Clone, Debug)]
struct Stats {
    at: Instant,
    elapsed: Duration,
    nr_layers: usize,
    nr_layer_tasks: Vec<usize>,
    nr_nodes: usize,

    total_util: f64,
    layer_utils: Vec<Vec<f64>>,
    prev_layer_usages: Vec<Vec<u64>>,

    layer_membws: Vec<Vec<f64>>,
    prev_layer_membw_agg: Vec<Vec<u64>>,

    cpu_busy: f64,
    prev_total_cpu: fb_procfs::CpuStat,

    bpf_stats: BpfStats,
    prev_bpf_stats: BpfStats,

    processing_dur: Duration,
    prev_processing_dur: Duration,

    layer_slice_us: Vec<u64>,

    gpu_tasks_affinitized: u64,
    gpu_task_affinitization_ms: u64,
}
954
955impl Stats {
956 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
957 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
958
959 for cpu in 0..*NR_CPUS_POSSIBLE {
960 for layer in 0..nr_layers {
961 for usage in 0..NR_LAYER_USAGES {
962 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
963 }
964 }
965 }
966
967 layer_usages
968 }
969
970 fn read_layer_membw_agg(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
972 let mut layer_membw_agg = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
973
974 for cpu in 0..*NR_CPUS_POSSIBLE {
975 for layer in 0..nr_layers {
976 for usage in 0..NR_LAYER_USAGES {
977 layer_membw_agg[layer][usage] += cpu_ctxs[cpu].layer_membw_agg[layer][usage];
978 }
979 }
980 }
981
982 layer_membw_agg
983 }
984
985 fn new(
986 skel: &mut BpfSkel,
987 proc_reader: &fb_procfs::ProcReader,
988 gpu_task_affinitizer: &GpuTaskAffinitizer,
989 ) -> Result<Self> {
990 let nr_layers = skel.maps.rodata_data.as_ref().unwrap().nr_layers as usize;
991 let cpu_ctxs = read_cpu_ctxs(skel)?;
992 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
993 let nr_nodes = skel.maps.rodata_data.as_ref().unwrap().nr_nodes as usize;
994
995 Ok(Self {
996 at: Instant::now(),
997 elapsed: Default::default(),
998 nr_layers,
999 nr_layer_tasks: vec![0; nr_layers],
1000 nr_nodes,
1001
1002 total_util: 0.0,
1003 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1004 layer_membws: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
1005 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
1006 prev_layer_membw_agg: Self::read_layer_membw_agg(&cpu_ctxs, nr_layers),
1007
1008 cpu_busy: 0.0,
1009 prev_total_cpu: read_total_cpu(proc_reader)?,
1010
1011 bpf_stats: bpf_stats.clone(),
1012 prev_bpf_stats: bpf_stats,
1013
1014 processing_dur: Default::default(),
1015 prev_processing_dur: Default::default(),
1016
1017 layer_slice_us: vec![0; nr_layers],
1018 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1019 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1020 })
1021 }
1022
1023 fn refresh(
1024 &mut self,
1025 skel: &mut BpfSkel,
1026 proc_reader: &fb_procfs::ProcReader,
1027 now: Instant,
1028 cur_processing_dur: Duration,
1029 gpu_task_affinitizer: &GpuTaskAffinitizer,
1030 ) -> Result<()> {
1031 let elapsed = now.duration_since(self.at);
1032 let elapsed_f64 = elapsed.as_secs_f64();
1033 let cpu_ctxs = read_cpu_ctxs(skel)?;
1034
1035 let nr_layer_tasks: Vec<usize> = skel
1036 .maps
1037 .bss_data
1038 .as_ref()
1039 .unwrap()
1040 .layers
1041 .iter()
1042 .take(self.nr_layers)
1043 .map(|layer| layer.nr_tasks as usize)
1044 .collect();
1045 let layer_slice_us: Vec<u64> = skel
1046 .maps
1047 .bss_data
1048 .as_ref()
1049 .unwrap()
1050 .layers
1051 .iter()
1052 .take(self.nr_layers)
1053 .map(|layer| layer.slice_ns / 1000_u64)
1054 .collect();
1055
1056 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
1057 let cur_layer_membw_agg = Self::read_layer_membw_agg(&cpu_ctxs, self.nr_layers);
1058
1059 let compute_diff = |cur_agg: &Vec<Vec<u64>>, prev_agg: &Vec<Vec<u64>>| {
1060 cur_agg
1061 .iter()
1062 .zip(prev_agg.iter())
1063 .map(|(cur, prev)| {
1064 cur.iter()
1065 .zip(prev.iter())
1066 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
1067 .collect()
1068 })
1069 .collect()
1070 };
1071
1072 let cur_layer_utils: Vec<Vec<f64>> =
1073 compute_diff(&cur_layer_usages, &self.prev_layer_usages);
1074 let cur_layer_membw: Vec<Vec<f64>> =
1075 compute_diff(&cur_layer_membw_agg, &self.prev_layer_membw_agg);
1076
1077 let metric_decay = |cur_metric: Vec<Vec<f64>>, prev_metric: &Vec<Vec<f64>>| {
1078 cur_metric
1079 .iter()
1080 .zip(prev_metric.iter())
1081 .map(|(cur, prev)| {
1082 cur.iter()
1083 .zip(prev.iter())
1084 .map(|(c, p)| {
1085 let decay = USAGE_DECAY.powf(elapsed_f64);
1086 p * decay + c * (1.0 - decay)
1087 })
1088 .collect()
1089 })
1090 .collect()
1091 };
1092
1093 let layer_utils: Vec<Vec<f64>> = metric_decay(cur_layer_utils, &self.layer_utils);
1094 let layer_membws: Vec<Vec<f64>> = metric_decay(cur_layer_membw, &self.layer_membws);
1095
1096 let cur_total_cpu = read_total_cpu(proc_reader)?;
1097 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1098
1099 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1100 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1101
1102 let processing_dur = cur_processing_dur
1103 .checked_sub(self.prev_processing_dur)
1104 .unwrap();
1105
1106 *self = Self {
1107 at: now,
1108 elapsed,
1109 nr_layers: self.nr_layers,
1110 nr_layer_tasks,
1111 nr_nodes: self.nr_nodes,
1112
1113 total_util: layer_utils
1114 .iter()
1115 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1116 .sum(),
1117 layer_utils,
1118 layer_membws,
1119 prev_layer_usages: cur_layer_usages,
1120 prev_layer_membw_agg: cur_layer_membw_agg,
1121
1122 cpu_busy,
1123 prev_total_cpu: cur_total_cpu,
1124
1125 bpf_stats,
1126 prev_bpf_stats: cur_bpf_stats,
1127
1128 processing_dur,
1129 prev_processing_dur: cur_processing_dur,
1130
1131 layer_slice_us,
1132 gpu_tasks_affinitized: gpu_task_affinitizer.tasks_affinitized,
1133 gpu_task_affinitization_ms: gpu_task_affinitizer.last_task_affinitization_ms,
1134 };
1135 Ok(())
1136 }
1137}
1138
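// Userspace bookkeeping for a single layer: its kind and growth algorithm, the
// preferred core allocation order, and the CPUs/LLCs currently assigned to it.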
1139#[derive(Debug)]
1140struct Layer {
1141 name: String,
1142 kind: LayerKind,
1143 growth_algo: LayerGrowthAlgo,
1144 core_order: Vec<usize>,
1145
1146 target_llc_cpus: (usize, usize),
1147 assigned_llcs: Vec<usize>,
1148
1149 nr_cpus: usize,
1150 nr_llc_cpus: Vec<usize>,
1151 cpus: Cpumask,
1152 allowed_cpus: Cpumask,
1153}
1154
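// Look up a symbol's address in /proc/kallsyms (first line whose text contains
// the symbol name).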
fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
    fs::read_to_string("/proc/kallsyms")?
        .lines()
        .find(|line| line.contains(sym_name))
        .and_then(|line| line.split_whitespace().next())
        .and_then(|addr| u64::from_str_radix(addr, 16).ok())
        .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
}
1163
fn resolve_cpus_pct_range(
    cpus_range: &Option<(usize, usize)>,
    cpus_range_frac: &Option<(f64, f64)>,
    max_cpus: usize,
) -> Result<(usize, usize)> {
    match (cpus_range, cpus_range_frac) {
        (Some(_x), Some(_y)) => {
            bail!("cpus_range cannot be used with cpus_range_frac.");
        }
        (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
        (None, Some((cpus_frac_min, cpus_frac_max))) => {
            if *cpus_frac_min < 0_f64
                || *cpus_frac_min > 1_f64
                || *cpus_frac_max < 0_f64
                || *cpus_frac_max > 1_f64
            {
                bail!("cpus_range_frac values must be between 0.0 and 1.0");
            }
            let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
            let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
            Ok((
                std::cmp::max(cpus_min_count, 1),
                std::cmp::min(cpus_max_count, max_cpus),
            ))
        }
        (None, None) => Ok((0, max_cpus)),
    }
}
1192
1193impl Layer {
1194 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1195 let name = &spec.name;
1196 let kind = spec.kind.clone();
1197 let mut allowed_cpus = Cpumask::new();
1198 match &kind {
1199 LayerKind::Confined {
1200 cpus_range,
1201 cpus_range_frac,
1202 common: LayerCommon { nodes, llcs, .. },
1203 ..
1204 } => {
1205 let cpus_range =
1206 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1207 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1208 bail!("invalid cpus_range {:?}", cpus_range);
1209 }
1210 if nodes.is_empty() && llcs.is_empty() {
1211 allowed_cpus.set_all();
1212 } else {
1213 for (node_id, node) in &topo.nodes {
1215 if nodes.contains(node_id) {
1217 for &id in node.all_cpus.keys() {
1218 allowed_cpus.set_cpu(id)?;
1219 }
1220 }
1221 for (llc_id, llc) in &node.llcs {
1223 if llcs.contains(llc_id) {
1224 for &id in llc.all_cpus.keys() {
1225 allowed_cpus.set_cpu(id)?;
1226 }
1227 }
1228 }
1229 }
1230 }
1231 }
1232 LayerKind::Grouped {
1233 common: LayerCommon { nodes, llcs, .. },
1234 ..
1235 }
1236 | LayerKind::Open {
1237 common: LayerCommon { nodes, llcs, .. },
1238 ..
1239 } => {
1240 if nodes.is_empty() && llcs.is_empty() {
1241 allowed_cpus.set_all();
1242 } else {
1243 for (node_id, node) in &topo.nodes {
1245 if nodes.contains(node_id) {
1247 for &id in node.all_cpus.keys() {
1248 allowed_cpus.set_cpu(id)?;
1249 }
1250 }
1251 for (llc_id, llc) in &node.llcs {
1253 if llcs.contains(llc_id) {
1254 for &id in llc.all_cpus.keys() {
1255 allowed_cpus.set_cpu(id)?;
1256 }
1257 }
1258 }
1259 }
1260 }
1261 }
1262 }
1263
1264 if let Some(util_range) = kind.util_range() {
1267 if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1268 bail!("invalid util_range {:?}", util_range);
1269 }
1270 }
1271
1272 let layer_growth_algo = kind.common().growth_algo.clone();
1273
1274 debug!(
1275 "layer: {} algo: {:?} core order: {:?}",
1276 name, &layer_growth_algo, core_order
1277 );
1278
1279 Ok(Self {
1280 name: name.into(),
1281 kind,
1282 growth_algo: layer_growth_algo,
1283 core_order: core_order.clone(),
1284
1285 target_llc_cpus: (0, 0),
1286 assigned_llcs: vec![],
1287
1288 nr_cpus: 0,
1289 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1290 cpus: Cpumask::new(),
1291 allowed_cpus,
1292 })
1293 }
1294
1295 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1296 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1297 Some(ret) => ret.clone(),
1298 None => return Ok(0),
1299 };
1300
1301 let nr_to_free = cpus_to_free.weight();
1302
1303 Ok(if nr_to_free <= max_to_free {
1304 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1305 self.cpus &= &cpus_to_free.not();
1306 self.nr_cpus -= nr_to_free;
1307 for cpu in cpus_to_free.iter() {
1308 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1309 }
1310 cpu_pool.free(&cpus_to_free)?;
1311 nr_to_free
1312 } else {
1313 0
1314 })
1315 }
1316
1317 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1318 let new_cpus = match cpu_pool
1319 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1320 .clone()
1321 {
1322 Some(ret) => ret.clone(),
1323 None => {
1324 trace!("layer-{} can't grow, no CPUs", &self.name);
1325 return Ok(0);
1326 }
1327 };
1328
1329 let nr_new_cpus = new_cpus.weight();
1330
1331 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1332 self.cpus |= &new_cpus;
1333 self.nr_cpus += nr_new_cpus;
1334 for cpu in new_cpus.iter() {
1335 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1336 }
1337 Ok(nr_new_cpus)
1338 }
1339}
1340#[derive(Debug, Clone)]
1341struct NodeInfo {
1342 node_mask: nix::sched::CpuSet,
1343 _node_id: usize,
1344}
1345
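// Periodically re-affinitizes GPU-using processes (and their child processes
// and threads) to the NUMA node closest to the GPU they use, based on NVML
// device affinity information.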
1346#[derive(Debug)]
1347struct GpuTaskAffinitizer {
1348 gpu_devs_to_node_info: HashMap<u32, NodeInfo>,
1351 gpu_pids_to_devs: HashMap<Pid, u32>,
1352 last_process_time: Option<Instant>,
1353 sys: System,
1354 pid_map: HashMap<Pid, Vec<Pid>>,
1355 poll_interval: Duration,
1356 enable: bool,
1357 tasks_affinitized: u64,
1358 last_task_affinitization_ms: u64,
1359}
1360
1361impl GpuTaskAffinitizer {
1362 pub fn new(poll_interval: u64, enable: bool) -> GpuTaskAffinitizer {
1363 GpuTaskAffinitizer {
1364 gpu_devs_to_node_info: HashMap::new(),
1365 gpu_pids_to_devs: HashMap::new(),
1366 last_process_time: None,
1367 sys: System::default(),
1368 pid_map: HashMap::new(),
1369 poll_interval: Duration::from_secs(poll_interval),
1370 enable,
1371 tasks_affinitized: 0,
1372 last_task_affinitization_ms: 0,
1373 }
1374 }
1375
1376 fn find_one_cpu(&self, affinity: Vec<u64>) -> Result<u32> {
1377 for (chunk, &mask) in affinity.iter().enumerate() {
1378 let mut inner_offset: u64 = 1;
1379 for _ in 0..64 {
1380 if (mask & inner_offset) != 0 {
1381 return Ok((64 * chunk + u64::trailing_zeros(inner_offset) as usize) as u32);
1382 }
1383 inner_offset = inner_offset << 1;
1384 }
1385 }
1386 anyhow::bail!("unable to get CPU from NVML bitmask");
1387 }
1388
1389 fn node_to_cpuset(&self, node: &scx_utils::Node) -> Result<CpuSet> {
1390 let mut cpuset = CpuSet::new();
1391 for (cpu_id, _cpu) in &node.all_cpus {
1392 cpuset.set(*cpu_id)?;
1393 }
1394 Ok(cpuset)
1395 }
1396
1397 fn init_dev_node_map(&mut self, topo: Arc<Topology>) -> Result<()> {
1398 let nvml = nvml()?;
1399 let device_count = nvml.device_count()?;
1400
1401 for idx in 0..device_count {
1402 let dev = nvml.device_by_index(idx)?;
1403 let cpu = dev.cpu_affinity(16)?;
1405 let ideal_cpu = self.find_one_cpu(cpu)?;
1406 if let Some(cpu) = topo.all_cpus.get(&(ideal_cpu as usize)) {
1407 self.gpu_devs_to_node_info.insert(
1408 idx,
1409 NodeInfo {
1410 node_mask: self.node_to_cpuset(
1411 topo.nodes.get(&cpu.node_id).expect("topo missing node"),
1412 )?,
1413 _node_id: cpu.node_id,
1414 },
1415 );
1416 }
1417 }
1418 Ok(())
1419 }
1420
1421 fn update_gpu_pids(&mut self) -> Result<()> {
1422 let nvml = nvml()?;
1423 for i in 0..nvml.device_count()? {
1424 let device = nvml.device_by_index(i)?;
1425 for proc in device
1426 .running_compute_processes()?
1427 .into_iter()
1428 .chain(device.running_graphics_processes()?.into_iter())
1429 {
1430 self.gpu_pids_to_devs.insert(Pid::from_u32(proc.pid), i);
1431 }
1432 }
1433 Ok(())
1434 }
1435
1436 fn update_process_info(&mut self) -> Result<()> {
1437 self.sys.refresh_processes_specifics(
1438 ProcessesToUpdate::All,
1439 true,
1440 ProcessRefreshKind::nothing(),
1441 );
1442 self.pid_map.clear();
1443 for (pid, proc_) in self.sys.processes() {
1444 if let Some(ppid) = proc_.parent() {
1445 self.pid_map.entry(ppid).or_default().push(*pid);
1446 }
1447 }
1448 Ok(())
1449 }
1450
1451 fn get_child_pids_and_tids(&self, root_pid: Pid) -> HashSet<Pid> {
1452 let mut work = VecDeque::from([root_pid]);
1453 let mut pids_and_tids: HashSet<Pid> = HashSet::new();
1454
1455 while let Some(pid) = work.pop_front() {
1456 if pids_and_tids.insert(pid) {
1457 if let Some(kids) = self.pid_map.get(&pid) {
1458 work.extend(kids);
1459 }
1460 if let Some(proc_) = self.sys.process(pid) {
1461 if let Some(tasks) = proc_.tasks() {
1462 pids_and_tids.extend(tasks.iter().copied());
1463 }
1464 }
1465 }
1466 }
1467 pids_and_tids
1468 }
1469
1470 fn affinitize_gpu_pids(&mut self) -> Result<()> {
1471 if !self.enable {
1472 return Ok(());
1473 }
1474 for (pid, dev) in &self.gpu_pids_to_devs {
1475 let node_info = self
1476 .gpu_devs_to_node_info
1477 .get(&dev)
1478 .expect("Unable to get gpu pid node mask");
1479 for child in self.get_child_pids_and_tids(*pid) {
1480 match nix::sched::sched_setaffinity(
1481 nix::unistd::Pid::from_raw(child.as_u32() as i32),
1482 &node_info.node_mask,
1483 ) {
1484 Ok(_) => {
1485 self.tasks_affinitized += 1;
1487 }
1488 Err(_) => {
1489 debug!(
1490 "Error affinitizing gpu pid {} to node {:#?}",
1491 child.as_u32(),
1492 node_info
1493 );
1494 }
1495 };
1496 }
1497 }
1498 Ok(())
1499 }
1500
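    // Rate-limited entry point: refresh GPU PID and process information and
    // re-apply affinity at most once per poll interval.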
1501 pub fn maybe_affinitize(&mut self) {
1502 if !self.enable {
1503 return;
1504 }
1505 let now = Instant::now();
1506
1507 if let Some(last_process_time) = self.last_process_time {
1508 if (now - last_process_time) < self.poll_interval {
1509 return;
1510 }
1511 }
1512
1513 match self.update_gpu_pids() {
1514 Ok(_) => {}
1515 Err(e) => {
1516 error!("Error updating GPU PIDs: {}", e);
1517 }
1518 };
1519 match self.update_process_info() {
1520 Ok(_) => {}
1521 Err(e) => {
1522 error!("Error updating process info to affinitize GPU PIDs: {}", e);
1523 }
1524 };
1525 match self.affinitize_gpu_pids() {
1526 Ok(_) => {}
1527 Err(e) => {
1528 error!("Error updating GPU PIDs: {}", e);
1529 }
1530 };
1531 self.last_process_time = Some(now);
1532 self.last_task_affinitization_ms = (Instant::now() - now).as_millis() as u64;
1533
    }
1536
1537 pub fn init(&mut self, topo: Arc<Topology>) {
1538 if !self.enable || self.last_process_time.is_some() {
1539 return;
1540 }
1541
1542 match self.init_dev_node_map(topo) {
1543 Ok(_) => {}
1544 Err(e) => {
1545 error!("Error initializing gpu node dev map: {}", e);
1546 }
1547 };
1548 self.sys = System::new_all();
    }
1551}
1552
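// Top-level scheduler state: the loaded BPF skeleton and struct_ops link,
// resolved layer specs, per-layer bookkeeping, topology, netdevs, and the
// stats server.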
1553struct Scheduler<'a> {
1554 skel: BpfSkel<'a>,
1555 struct_ops: Option<libbpf_rs::Link>,
1556 layer_specs: Vec<LayerSpec>,
1557
1558 sched_intv: Duration,
1559 layer_refresh_intv: Duration,
1560
1561 cpu_pool: CpuPool,
1562 layers: Vec<Layer>,
1563 idle_qos_enabled: bool,
1564
1565 proc_reader: fb_procfs::ProcReader,
1566 sched_stats: Stats,
1567
1568 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1569 processing_dur: Duration,
1570
1571 topo: Arc<Topology>,
1572 netdevs: BTreeMap<String, NetDev>,
1573 stats_server: StatsServer<StatsReq, StatsRes>,
1574 gpu_task_handler: GpuTaskAffinitizer,
1575}
1576
1577impl<'a> Scheduler<'a> {
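    // Translate layer specs into the BPF-side layer array: match rules, common
    // parameters, cpusets, and the weight-sorted layer iteration order.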
1578 fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1579 skel.maps.rodata_data.as_mut().unwrap().nr_layers = specs.len() as u32;
1580 let mut perf_set = false;
1581
1582 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1583 let mut layer_weights: Vec<usize> = vec![];
1584
1585 for (spec_i, spec) in specs.iter().enumerate() {
1586 let layer = &mut skel.maps.bss_data.as_mut().unwrap().layers[spec_i];
1587
1588 for (or_i, or) in spec.matches.iter().enumerate() {
1589 for (and_i, and) in or.iter().enumerate() {
1590 let mt = &mut layer.matches[or_i].matches[and_i];
1591
1592 mt.exclude.write(false);
1594
1595 match and {
1596 LayerMatch::CgroupPrefix(prefix) => {
1597 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1598 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1599 }
1600 LayerMatch::CgroupSuffix(suffix) => {
1601 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1602 copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1603 }
1604 LayerMatch::CgroupRegex(_) => {
1605 panic!("CgroupRegex match only supported in template");
1606 }
1607 LayerMatch::CgroupContains(substr) => {
1608 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1609 copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1610 }
1611 LayerMatch::CommPrefix(prefix) => {
1612 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1613 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1614 }
1615 LayerMatch::CommPrefixExclude(prefix) => {
1616 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1617 mt.exclude.write(true);
1618 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1619 }
1620 LayerMatch::PcommPrefix(prefix) => {
1621 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1622 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1623 }
1624 LayerMatch::PcommPrefixExclude(prefix) => {
1625 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1626 mt.exclude.write(true);
1627 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1628 }
1629 LayerMatch::NiceAbove(nice) => {
1630 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1631 mt.nice = *nice;
1632 }
1633 LayerMatch::NiceBelow(nice) => {
1634 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1635 mt.nice = *nice;
1636 }
1637 LayerMatch::NiceEquals(nice) => {
1638 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1639 mt.nice = *nice;
1640 }
1641 LayerMatch::UIDEquals(user_id) => {
1642 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1643 mt.user_id = *user_id;
1644 }
1645 LayerMatch::GIDEquals(group_id) => {
1646 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1647 mt.group_id = *group_id;
1648 }
1649 LayerMatch::PIDEquals(pid) => {
1650 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1651 mt.pid = *pid;
1652 }
1653 LayerMatch::PPIDEquals(ppid) => {
1654 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1655 mt.ppid = *ppid;
1656 }
1657 LayerMatch::TGIDEquals(tgid) => {
1658 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1659 mt.tgid = *tgid;
1660 }
1661 LayerMatch::NSPIDEquals(nsid, pid) => {
1662 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1663 mt.nsid = *nsid;
1664 mt.pid = *pid;
1665 }
1666 LayerMatch::NSEquals(nsid) => {
1667 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1668 mt.nsid = *nsid as u64;
1669 }
1670 LayerMatch::CmdJoin(joincmd) => {
1671 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1672 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1673 }
1674 LayerMatch::IsGroupLeader(polarity) => {
1675 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1676 mt.is_group_leader.write(*polarity);
1677 }
1678 LayerMatch::IsKthread(polarity) => {
1679 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1680 mt.is_kthread.write(*polarity);
1681 }
1682 LayerMatch::UsedGpuTid(polarity) => {
1683 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1684 mt.used_gpu_tid.write(*polarity);
1685 }
1686 LayerMatch::UsedGpuPid(polarity) => {
1687 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1688 mt.used_gpu_pid.write(*polarity);
1689 }
1690 LayerMatch::AvgRuntime(min, max) => {
1691 mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1692 mt.min_avg_runtime_us = *min;
1693 mt.max_avg_runtime_us = *max;
1694 }
1695 LayerMatch::HintEquals(hint) => {
1696 mt.kind = bpf_intf::layer_match_kind_MATCH_HINT_EQUALS as i32;
1697 mt.hint = *hint;
1698 }
1699 }
1700 }
1701 layer.matches[or_i].nr_match_ands = or.len() as i32;
1702 }
1703
1704 layer.nr_match_ors = spec.matches.len() as u32;
1705 layer.kind = spec.kind.as_bpf_enum();
1706
1707 {
1708 let LayerCommon {
1709 min_exec_us,
1710 yield_ignore,
1711 perf,
1712 preempt,
1713 preempt_first,
1714 exclusive,
1715 allow_node_aligned,
1716 skip_remote_node,
1717 prev_over_idle_core,
1718 growth_algo,
1719 nodes,
1720 slice_us,
1721 fifo,
1722 weight,
1723 disallow_open_after_us,
1724 disallow_preempt_after_us,
1725 xllc_mig_min_us,
1726 placement,
1727 ..
1728 } = spec.kind.common();
1729
1730 layer.slice_ns = *slice_us * 1000;
1731 layer.fifo.write(*fifo);
1732 layer.min_exec_ns = min_exec_us * 1000;
1733 layer.yield_step_ns = if *yield_ignore > 0.999 {
1734 0
1735 } else if *yield_ignore < 0.001 {
1736 layer.slice_ns
1737 } else {
1738 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1739 };
1740 let mut layer_name: String = spec.name.clone();
1741 layer_name.truncate(MAX_LAYER_NAME);
1742 copy_into_cstr(&mut layer.name, layer_name.as_str());
1743 layer.preempt.write(*preempt);
1744 layer.preempt_first.write(*preempt_first);
1745 layer.excl.write(*exclusive);
1746 layer.allow_node_aligned.write(*allow_node_aligned);
1747 layer.skip_remote_node.write(*skip_remote_node);
1748 layer.prev_over_idle_core.write(*prev_over_idle_core);
1749 layer.growth_algo = growth_algo.as_bpf_enum();
1750 layer.weight = *weight;
1751 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1752 v if v == u64::MAX => v,
1753 v => v * 1000,
1754 };
1755 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1756 v if v == u64::MAX => v,
1757 v => v * 1000,
1758 };
1759 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1760 layer_weights.push(layer.weight.try_into().unwrap());
1761 layer.perf = u32::try_from(*perf)?;
1762 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1763 for (topo_node_id, topo_node) in &topo.nodes {
1764 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1765 continue;
1766 }
1767 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1768 }
1769
1770 let task_place = |place: u32| crate::types::layer_task_place(place);
1771 layer.task_place = match placement {
1772 LayerPlacement::Standard => {
1773 task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1774 }
1775 LayerPlacement::Sticky => {
1776 task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1777 }
1778 LayerPlacement::Floating => {
1779 task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1780 }
1781 };
1782 }
1783
1784 layer.is_protected.write(match spec.kind {
1785 LayerKind::Open { .. } => false,
1786 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1787 protected
1788 }
1789 });
1790
1791 match &spec.cpuset {
1792 Some(mask) => {
1793 Self::update_cpumask(&mask, &mut layer.cpuset);
1794 }
1795 None => {
1796 for i in 0..layer.cpuset.len() {
1797 layer.cpuset[i] = u8::MAX;
1798 }
1799 }
1800 };
1801
1802 perf_set |= layer.perf > 0;
1803 }
1804
1805 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1806 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1807 skel.maps
1808 .rodata_data
1809 .as_mut()
1810 .unwrap()
1811 .layer_iteration_order[idx] = *layer_idx as u32;
1812 }
1813
1814 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1815 warn!("cpufreq support not available, ignoring perf configurations");
1816 }
1817
1818 Ok(())
1819 }
1820
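    // Copy NUMA node and LLC topology (node cpumasks, LLC-to-node and
    // CPU-to-LLC mappings) into BPF rodata.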
1821 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1822 skel.maps.rodata_data.as_mut().unwrap().nr_nodes = topo.nodes.len() as u32;
1823 skel.maps.rodata_data.as_mut().unwrap().nr_llcs = 0;
1824
1825 for (&node_id, node) in &topo.nodes {
1826 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1827 skel.maps.rodata_data.as_mut().unwrap().nr_llcs += node.llcs.len() as u32;
1828 let raw_numa_slice = node.span.as_raw_slice();
1829 let node_cpumask_slice =
1830 &mut skel.maps.rodata_data.as_mut().unwrap().numa_cpumasks[node_id];
1831 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1832 left.clone_from_slice(raw_numa_slice);
1833 debug!(
1834 "node {} mask: {:?}",
1835 node_id,
1836 skel.maps.rodata_data.as_ref().unwrap().numa_cpumasks[node_id]
1837 );
1838
1839 for llc in node.llcs.values() {
1840 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1841 skel.maps.rodata_data.as_mut().unwrap().llc_numa_id_map[llc.id] = node_id as u32;
1842 }
1843 }
1844
1845 for cpu in topo.all_cpus.values() {
1846 skel.maps.rodata_data.as_mut().unwrap().cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1847 }
1848 }
1849
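    // Build each CPU's proximity map: the CPU itself, then the rest of its
    // core, LLC, node, and the remaining system, with each group sorted to
    // keep nearby cores and CPUs first.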
1850 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1851 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1852 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1853 vec
1854 };
1855 let radiate_cpu =
1856 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1857 vec.sort_by_key(|&id| {
1858 (
1859 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1860 (center_cpu as i32 - id as i32).abs(),
1861 )
1862 });
1863 vec
1864 };
1865
1866 for (&cpu_id, cpu) in &topo.all_cpus {
1867 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1869 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1870 let node_span = &topo.nodes[&cpu.node_id].span;
1871 let sys_span = &topo.span;
1872
1873 let sys_span = sys_span.and(&node_span.not());
1875 let node_span = node_span.and(&llc_span.not());
1876 let llc_span = llc_span.and(&core_span.not());
1877 core_span.clear_cpu(cpu_id).unwrap();
1878
1879 let mut sys_order: Vec<usize> = sys_span.iter().collect();
1881 let mut node_order: Vec<usize> = node_span.iter().collect();
1882 let mut llc_order: Vec<usize> = llc_span.iter().collect();
1883 let mut core_order: Vec<usize> = core_span.iter().collect();
1884
1885 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1890 node_order = radiate(node_order, cpu.node_id);
1891 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1892 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1893
1894 let mut order: Vec<usize> = vec![];
1896 let mut idx: usize = 0;
1897
1898 idx += 1;
1899 order.push(cpu_id);
1900
1901 idx += core_order.len();
1902 order.append(&mut core_order);
1903 let core_end = idx;
1904
1905 idx += llc_order.len();
1906 order.append(&mut llc_order);
1907 let llc_end = idx;
1908
1909 idx += node_order.len();
1910 order.append(&mut node_order);
1911 let node_end = idx;
1912
1913 idx += sys_order.len();
1914 order.append(&mut sys_order);
1915 let sys_end = idx;
1916
1917 debug!(
1918 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1919 cpu_id, core_end, llc_end, node_end, sys_end, &order
1920 );
1921
1922 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1924 for (i, &cpu) in order.iter().enumerate() {
1925 pmap.cpus[i] = cpu as u16;
1926 }
1927 pmap.core_end = core_end as u32;
1928 pmap.llc_end = llc_end as u32;
1929 pmap.node_end = node_end as u32;
1930 pmap.sys_end = sys_end as u32;
1931 }
1932 }
1933
1934 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1935 cpu_ctxs
1936 .into_iter()
1937 .map(|cpu_ctx| {
1938 let bytes = unsafe {
1939 std::slice::from_raw_parts(
1940 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1941 std::mem::size_of::<bpf_intf::cpu_ctx>(),
1942 )
1943 };
1944 bytes.to_vec()
1945 })
1946 .collect()
1947 }
1948
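    // Initialize per-CPU contexts and give each CPU its own randomized
    // iteration order over open/grouped, preempting/non-preempting layers.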
1949 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1950 let key = (0_u32).to_ne_bytes();
1951 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1952 let cpu_ctxs_vec = skel
1953 .maps
1954 .cpu_ctxs
1955 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1956 .context("Failed to lookup cpu_ctx")?
1957 .unwrap();
1958
1959 let op_layers: Vec<u32> = layer_specs
1960 .iter()
1961 .enumerate()
1962 .filter(|(_idx, spec)| match &spec.kind {
1963 LayerKind::Open { .. } => spec.kind.common().preempt,
1964 _ => false,
1965 })
1966 .map(|(idx, _)| idx as u32)
1967 .collect();
1968 let on_layers: Vec<u32> = layer_specs
1969 .iter()
1970 .enumerate()
1971 .filter(|(_idx, spec)| match &spec.kind {
1972 LayerKind::Open { .. } => !spec.kind.common().preempt,
1973 _ => false,
1974 })
1975 .map(|(idx, _)| idx as u32)
1976 .collect();
1977 let gp_layers: Vec<u32> = layer_specs
1978 .iter()
1979 .enumerate()
1980 .filter(|(_idx, spec)| match &spec.kind {
1981 LayerKind::Grouped { .. } => spec.kind.common().preempt,
1982 _ => false,
1983 })
1984 .map(|(idx, _)| idx as u32)
1985 .collect();
1986 let gn_layers: Vec<u32> = layer_specs
1987 .iter()
1988 .enumerate()
1989 .filter(|(_idx, spec)| match &spec.kind {
1990 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1991 _ => false,
1992 })
1993 .map(|(idx, _)| idx as u32)
1994 .collect();
1995
1996 for cpu in 0..*NR_CPUS_POSSIBLE {
1998 cpu_ctxs.push(*unsafe {
1999 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
2000 });
2001
2002 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
2003 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
2004 cpu_ctxs[cpu].cpu = cpu as i32;
2005 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
2006 cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
2007 cpu_ctxs[cpu].is_big = is_big;
2008
2009 fastrand::seed(cpu as u64);
2010
2011 let mut ogp_order = op_layers.clone();
2012 ogp_order.append(&mut gp_layers.clone());
2013 fastrand::shuffle(&mut ogp_order);
2014
2015 let mut ogn_order = on_layers.clone();
2016 ogn_order.append(&mut gn_layers.clone());
2017 fastrand::shuffle(&mut ogn_order);
2018
2019 let mut op_order = op_layers.clone();
2020 fastrand::shuffle(&mut op_order);
2021
2022 let mut on_order = on_layers.clone();
2023 fastrand::shuffle(&mut on_order);
2024
2025 let mut gp_order = gp_layers.clone();
2026 fastrand::shuffle(&mut gp_order);
2027
2028 let mut gn_order = gn_layers.clone();
2029 fastrand::shuffle(&mut gn_order);
2030
2031 for i in 0..MAX_LAYERS {
2032 cpu_ctxs[cpu].ogp_layer_order[i] =
2033 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2034 cpu_ctxs[cpu].ogn_layer_order[i] =
2035 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2036
2037 cpu_ctxs[cpu].op_layer_order[i] =
2038 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2039 cpu_ctxs[cpu].on_layer_order[i] =
2040 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2041 cpu_ctxs[cpu].gp_layer_order[i] =
2042 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2043 cpu_ctxs[cpu].gn_layer_order[i] =
2044 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
2045 }
2046 }
2047
2048 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
2049
2050 skel.maps
2051 .cpu_ctxs
2052 .update_percpu(
2053 &key,
2054 &Self::convert_cpu_ctxs(cpu_ctxs),
2055 libbpf_rs::MapFlags::ANY,
2056 )
2057 .context("Failed to update cpu_ctx")?;
2058
2059 Ok(())
2060 }
2061
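    // Build each LLC's proximity map: the LLC itself, then the other LLCs on
    // the same node, then the rest of the system, with each group shuffled
    // deterministically per LLC.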
2062 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
2063 for (&llc_id, llc) in &topo.all_llcs {
2064 let mut node_order: Vec<usize> =
2066 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
2067 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
2068
2069 sys_order.retain(|id| !node_order.contains(id));
2071 node_order.retain(|&id| id != llc_id);
2072
2073 fastrand::seed(llc_id as u64);
2076 fastrand::shuffle(&mut sys_order);
2077 fastrand::shuffle(&mut node_order);
2078
2079 let mut order: Vec<usize> = vec![];
2081 let mut idx: usize = 0;
2082
2083 idx += 1;
2084 order.push(llc_id);
2085
2086 idx += node_order.len();
2087 order.append(&mut node_order);
2088 let node_end = idx;
2089
2090 idx += sys_order.len();
2091 order.append(&mut sys_order);
2092 let sys_end = idx;
2093
2094 debug!(
2095 "LLC[{}] proximity map[{}/{}]: {:?}",
2096 llc_id, node_end, sys_end, &order
2097 );
2098
2099 let key = llc_id as u32;
2104 let llc_id_slice =
2105 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
2106 let v = skel
2107 .maps
2108 .llc_data
2109 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
2110 .unwrap()
2111 .unwrap();
2112 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
2113
2114 let pmap = &mut llcc.prox_map;
2115 for (i, &llc_id) in order.iter().enumerate() {
2116 pmap.llcs[i] = llc_id as u16;
2117 }
2118 pmap.node_end = node_end as u32;
2119 pmap.sys_end = sys_end as u32;
2120
2121 let v = unsafe {
2122 std::slice::from_raw_parts(
2123 &llcc as *const bpf_intf::llc_ctx as *const u8,
2124 std::mem::size_of::<bpf_intf::llc_ctx>(),
2125 )
2126 };
2127
2128 skel.maps
2129 .llc_data
2130 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
2131 }
2132
2133 Ok(())
2134 }
2135
2136 fn init(
2137 opts: &'a Opts,
2138 layer_specs: &[LayerSpec],
2139 open_object: &'a mut MaybeUninit<OpenObject>,
2140 hint_to_layer_map: &HashMap<u64, usize>,
2141 ) -> Result<Self> {
2142 let nr_layers = layer_specs.len();
2143 let mut disable_topology = opts.disable_topology.unwrap_or(false);
2144
2145 let topo = Arc::new(if disable_topology {
2146 Topology::with_flattened_llc_node()?
2147 } else {
2148 Topology::new()?
2149 });
2150
2151 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
2158 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
2159 }
2160 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
2161 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
2162 }
2163 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
2164 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
2165 }
2166
2167 let netdevs = if opts.netdev_irq_balance {
2168 warn!(
2169 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
2170 );
2171 read_netdevs()?
2172 } else {
2173 BTreeMap::new()
2174 };
2175
2176 if !disable_topology {
2177 if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
2178 disable_topology = true;
2179 };
2180 info!(
2181 "Topology awareness not specified, selecting {} based on hardware",
2182 if disable_topology {
2183 "disabled"
2184 } else {
2185 "enabled"
2186 }
2187 );
2188 };
2189
2190 let cpu_pool = CpuPool::new(topo.clone())?;
2191
2192 let layer_specs: Vec<_> = if disable_topology {
2195 info!("Disabling topology awareness");
2196 layer_specs
2197 .iter()
2198 .cloned()
2199 .map(|mut s| {
2200 s.kind.common_mut().nodes.clear();
2201 s.kind.common_mut().llcs.clear();
2202 s
2203 })
2204 .collect()
2205 } else {
2206 layer_specs.to_vec()
2207 };
2208
2209 init_libbpf_logging(None);
2211 let kfuncs_in_syscall = scx_bpf_compat::kfuncs_supported_in_syscall()?;
2212 if !kfuncs_in_syscall {
            warn!("Using slow path: kfuncs not supported in syscall programs (a8e03b6bbb2c ∉ kernel)");
2214 }
2215
2216 let mut skel_builder = BpfSkelBuilder::default();
2218 skel_builder.obj_builder.debug(opts.verbose > 1);
2219 info!(
2220 "Running scx_layered (build ID: {})",
2221 build_id::full_version(env!("CARGO_PKG_VERSION"))
2222 );
2223 let open_opts = opts.libbpf.clone().into_bpf_open_opts();
2224 let mut skel = scx_ops_open!(skel_builder, open_object, layered, open_opts)?;
2225
2226 if opts.enable_gpu_support {
2229 if opts.gpu_kprobe_level >= 1 {
2232 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
2233 }
2234 if opts.gpu_kprobe_level >= 2 {
2237 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
2238 }
2239 if opts.gpu_kprobe_level >= 3 {
2240 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
2241 }
2242 }
2243
2244 let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
2245 let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
2246
2247 let rodata = skel.maps.rodata_data.as_mut().unwrap();
2248
2249 if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
2250 rodata.ext_sched_class_addr = ext_sched_class_addr.unwrap();
2251 rodata.idle_sched_class_addr = idle_sched_class_addr.unwrap();
2252 } else {
2253 warn!(
2254 "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
2255 );
2256 }
2257
2258 rodata.slice_ns = scx_enums.SCX_SLICE_DFL;
2259 rodata.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
2260
2261 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
2263
2264 if !opts.disable_queued_wakeup {
2265 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
2266 0 => info!("Kernel does not support queued wakeup optimization"),
2267 v => skel.struct_ops.layered_mut().flags |= v,
2268 }
2269 }
2270
2271 rodata.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
2272 rodata.percpu_kthread_preempt_all =
2273 !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
2274 rodata.debug = opts.verbose as u32;
2275 rodata.slice_ns = opts.slice_us * 1000;
2276 rodata.max_exec_ns = if opts.max_exec_us > 0 {
2277 opts.max_exec_us * 1000
2278 } else {
2279 opts.slice_us * 1000 * 20
2280 };
2281 rodata.nr_cpu_ids = *NR_CPU_IDS as u32;
2282 rodata.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
2283 rodata.smt_enabled = topo.smt_enabled;
2284 rodata.has_little_cores = topo.has_little_cores();
2285 rodata.xnuma_preemption = opts.xnuma_preemption;
2286 rodata.antistall_sec = opts.antistall_sec;
2287 rodata.monitor_disable = opts.monitor_disable;
2288 rodata.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
2289 rodata.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
2290 rodata.enable_antistall = !opts.disable_antistall;
2291 rodata.enable_match_debug = opts.enable_match_debug;
2292 rodata.enable_gpu_support = opts.enable_gpu_support;
2293 rodata.kfuncs_supported_in_syscall = kfuncs_in_syscall;
2294
2295 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
2296 rodata.__sibling_cpu[cpu] = *sib;
2297 }
2298 for cpu in topo.all_cpus.keys() {
2299 rodata.all_cpus[cpu / 8] |= 1 << (cpu % 8);
2300 }
2301
2302 rodata.nr_op_layers = layer_specs
2303 .iter()
2304 .filter(|spec| match &spec.kind {
2305 LayerKind::Open { .. } => spec.kind.common().preempt,
2306 _ => false,
2307 })
2308 .count() as u32;
2309 rodata.nr_on_layers = layer_specs
2310 .iter()
2311 .filter(|spec| match &spec.kind {
2312 LayerKind::Open { .. } => !spec.kind.common().preempt,
2313 _ => false,
2314 })
2315 .count() as u32;
2316 rodata.nr_gp_layers = layer_specs
2317 .iter()
2318 .filter(|spec| match &spec.kind {
2319 LayerKind::Grouped { .. } => spec.kind.common().preempt,
2320 _ => false,
2321 })
2322 .count() as u32;
2323 rodata.nr_gn_layers = layer_specs
2324 .iter()
2325 .filter(|spec| match &spec.kind {
2326 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
2327 _ => false,
2328 })
2329 .count() as u32;
2330 rodata.nr_excl_layers = layer_specs
2331 .iter()
2332 .filter(|spec| spec.kind.common().exclusive)
2333 .count() as u32;
2334
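// Find the earliest disallow_open/disallow_preempt deadlines across Open
// layers. disallow_*_after_us are always populated in main() before init(),
// so the unwrap()s below cannot panic; u64::MAX means "no Open layer".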
2335 let mut min_open = u64::MAX;
2336 let mut min_preempt = u64::MAX;
2337
2338 for spec in layer_specs.iter() {
2339 if let LayerKind::Open { common, .. } = &spec.kind {
2340 min_open = min_open.min(common.disallow_open_after_us.unwrap());
2341 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2342 }
2343 }
2344
2345 rodata.min_open_layer_disallow_open_after_ns = match min_open {
2346 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2347 v => v,
2348 };
2349 rodata.min_open_layer_disallow_preempt_after_ns = match min_preempt {
2350 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2351 v => v,
2352 };
2353
2354 for i in 0..layer_specs.len() {
2356 skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] = i as u32;
2357 }
2358 skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids = nr_layers as u32;
2359
2360 let layered_task_hint_map_path = &opts.task_hint_map;
2365 let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2366 if !layered_task_hint_map_path.is_empty() {
2368 hint_map.set_pin_path(layered_task_hint_map_path).context("Failed to set pin path for task hint map")?;
2369 rodata.task_hint_map_enabled = true;
2370 }
2371
2372 if !opts.hi_fb_thread_name.is_empty() {
2373 let bpf_hi_fb_thread_name = &mut rodata.hi_fb_thread_name;
2374 copy_into_cstr(bpf_hi_fb_thread_name, opts.hi_fb_thread_name.as_str());
2375 rodata.enable_hi_fb_thread_name_match = true;
2376 }
2377
2378 Self::init_layers(&mut skel, &layer_specs, &topo)?;
2379 Self::init_nodes(&mut skel, opts, &topo);
2380
2381 let mut skel = scx_ops_load!(skel, layered, uei)?;
2382
2383 if !hint_to_layer_map.is_empty() {
2385 for (k, v) in hint_to_layer_map.iter() {
2386 let key: u32 = *k as u32;
2387 let value: u32 = *v as u32;
2388 skel.maps.hint_to_layer_id_map.update(
2389 &key.to_ne_bytes(),
2390 &value.to_ne_bytes(),
2391 libbpf_rs::MapFlags::ANY,
2392 )?;
2393 }
2394 }
2395
2396 let mut layers = vec![];
2397 let layer_growth_orders =
2398 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2399 for (idx, spec) in layer_specs.iter().enumerate() {
2400 let growth_order = layer_growth_orders
2401 .get(&idx)
2402 .with_context(|| "layer has no growth order".to_string())?;
2403 layers.push(Layer::new(spec, &topo, growth_order)?);
2404 }
2405
2406 let mut idle_qos_enabled = layers
2407 .iter()
2408 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2409 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2410 warn!("idle_resume_us not supported, ignoring");
2411 idle_qos_enabled = false;
2412 }
2413
2414 Self::init_cpus(&skel, &layer_specs, &topo)?;
2415 Self::init_llc_prox_map(&mut skel, &topo)?;
2416
2417 let proc_reader = fb_procfs::ProcReader::new();
2419
2420 let input = ProgramInput {
2422 ..Default::default()
2423 };
2424 let prog = &mut skel.progs.initialize_pid_namespace;
2425
2426 let _ = prog.test_run(input);
2427
2428 if !layered_task_hint_map_path.is_empty() {
2437 let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2438 let mode: libc::mode_t = 0o666;
2439 unsafe {
2440 if libc::chmod(path.as_ptr(), mode) != 0 {
2441 trace!("'chmod' to 666 of task hint map failed, continuing...");
2442 }
2443 }
2444 }
2445
2446 let struct_ops = scx_ops_attach!(skel, layered)?;
2448 let stats_server = StatsServer::new(stats::server_data()).launch()?;
2449 let mut gpu_task_handler =
2450 GpuTaskAffinitizer::new(opts.gpu_affinitize_secs, opts.enable_gpu_affinitize);
2451 gpu_task_handler.init(topo.clone());
2452 let sched = Self {
2453 struct_ops: Some(struct_ops),
2454 layer_specs,
2455
2456 sched_intv: Duration::from_secs_f64(opts.interval),
2457 layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2458
2459 cpu_pool,
2460 layers,
2461 idle_qos_enabled,
2462
2463 sched_stats: Stats::new(&mut skel, &proc_reader, &gpu_task_handler)?,
2464
2465 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2466 processing_dur: Default::default(),
2467
2468 proc_reader,
2469 skel,
2470
2471 topo,
2472 netdevs,
2473 stats_server,
2474 gpu_task_handler,
2475 };
2476
2477 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2478
2479 Ok(sched)
2480 }
2481
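// Pack a Cpumask into the byte array shared with BPF: bit (cpu % 8) of byte
// (cpu / 8) is set iff the CPU is in the mask. For example, a mask holding
// CPUs {0, 9} sets bit 0 of byte 0 and bit 1 of byte 1.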
2482 fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2483 for cpu in 0..mask.len() {
2484 if mask.test_cpu(cpu) {
2485 bpfmask[cpu / 8] |= 1 << (cpu % 8);
2486 } else {
2487 bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2488 }
2489 }
2490 }
2491
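// Mirror a layer's CPU assignment into its BPF counterpart and flag
// refresh_cpus so the BPF side re-reads the mask on its next refresh.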
2492 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2493 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2494 Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2495
2496 bpf_layer.nr_cpus = layer.nr_cpus as u32;
2497 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2498 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2499 }
2500
2501 bpf_layer.refresh_cpus = 1;
2502 }
2503
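// Steer each netdev's IRQs to the currently available CPUs on the device's
// node, falling back to the whole node span when no available CPU overlaps it.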
2504 fn update_netdev_cpumasks(&mut self) -> Result<()> {
2505 let available_cpus = self.cpu_pool.available_cpus();
2506 if available_cpus.is_empty() {
2507 return Ok(());
2508 }
2509
2510 for (iface, netdev) in self.netdevs.iter_mut() {
2511 let node = self
2512 .topo
2513 .nodes
2514 .values()
2515 .find(|n| n.id == netdev.node())
2517 .ok_or_else(|| anyhow!("Failed to get node for netdev {}", iface))?;
2518 let node_cpus = node.span.clone();
2519 for (irq, irqmask) in netdev.irqs.iter_mut() {
2520 irqmask.clear_all();
2521 for cpu in available_cpus.iter() {
2522 if !node_cpus.test_cpu(cpu) {
2523 continue;
2524 }
2525 let _ = irqmask.set_cpu(cpu);
2526 }
2527 if irqmask.weight() == 0 {
2529 for cpu in node_cpus.iter() {
2530 let _ = irqmask.set_cpu(cpu);
2531 }
2532 }
2533 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2534 }
2535 netdev.apply_cpumasks()?;
2536 }
2537
2538 Ok(())
2539 }
2540
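// Clamp the upper bound of a layer's CPU target so that projected memory
// bandwidth stays under membw_limit, using last interval's per-CPU bandwidth
// as the estimate. For instance (hypothetical numbers), 40 GiB of traffic
// over 4 CPUs gives 10 GiB per CPU; with a 60 GiB limit and low/high of 2/8,
// the returned cap is 60 / 10 = 6 CPUs.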
2541 fn clamp_target_by_membw(
2542 &self,
2543 layer: &Layer,
2544 membw_limit: usize,
2545 last_membw: usize,
2546 low: usize,
2547 high: usize,
2548 ) -> usize {
2549 let ncpu = layer.cpus.weight();
2550 let last_membw_percpu = if ncpu > 0 { last_membw / ncpu } else { 0 };
2551
2552 if last_membw_percpu == 0 || last_membw_percpu * high < membw_limit {
2554 return high;
2555 }
2556
2557 if last_membw_percpu * low > membw_limit {
2560 warn!("cannot satisfy memory bw limit for layer {}", layer.name);
2561 return low;
2562 }
2563
2564 membw_limit / last_membw_percpu
2566 }
2567
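// Compute (target, min) CPU counts per layer. For Confined/Grouped layers the
// target is derived from recent utilization and util_range, optionally capped
// by the memory bandwidth limit and clamped into cpus_range; Open layers get
// (0, 0). E.g. util 2.4 with util_range (0.5, 0.8) gives low = ceil(2.4/0.8)
// = 3 and high = floor(2.4/0.5) = 4 CPUs.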
2568 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2574 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2575 let utils = &self.sched_stats.layer_utils;
2576
2577 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2578 let mut targets: Vec<(usize, usize)> = vec![];
2579
2580 for (idx, layer) in self.layers.iter().enumerate() {
2581 targets.push(match &layer.kind {
2582 LayerKind::Confined {
2583 util_range,
2584 cpus_range,
2585 cpus_range_frac,
2586 membw_gb,
2587 ..
2588 }
2589 | LayerKind::Grouped {
2590 util_range,
2591 cpus_range,
2592 cpus_range_frac,
2593 membw_gb,
2594 ..
2595 } => {
2596 let owned = utils[idx][LAYER_USAGE_OWNED];
2601 let open = utils[idx][LAYER_USAGE_OPEN];
2602
2603 let mut util = owned;
2604 if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2605 util += open;
2606 }
2607
2608 let util = if util < 0.01 { 0.0 } else { util };
2609 let low = (util / util_range.1).ceil() as usize;
2610 let mut high = ((util / util_range.0).floor() as usize).max(low);
2611
2612 if let Some(membw_limit) = membw_gb {
2613 let membw_cpus = &self.sched_stats.layer_membws[idx];
2614 let last_membw = membw_cpus.iter().map(|&x| x as usize).sum();
2615 high = self.clamp_target_by_membw(
2616 &layer,
2617 (*membw_limit * (1024_u64.pow(3) as f64)) as usize,
2618 last_membw,
2619 low,
2620 high,
2621 );
2622 }
2623
2624 let target = layer.cpus.weight().clamp(low, high);
2625 let cpus_range =
2626 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2627
2628 records.push((
2629 (owned * 100.0) as u64,
2630 (open * 100.0) as u64,
2631 (util * 100.0) as u64,
2632 low,
2633 high,
2634 target,
2635 ));
2636
2637 (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2638 }
2639 LayerKind::Open { .. } => (0, 0),
2640 });
2641 }
2642
2643 trace!("initial targets: {:?}", &targets);
2644 trace!("(owned, open, util, low, high, target): {:?}", &records);
2645 targets
2646 }
2647
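// Distribute the machine's CPUs across layers by weight: layers already at or
// below their minimum are satisfied first, then layers whose target fits
// within their weighted share (share = ceil(nr_left * weight / weight_sum)),
// and whatever remains is split among the rest proportionally to weight.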
2648 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2652 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2653 let weights: Vec<usize> = self
2654 .layers
2655 .iter()
2656 .map(|layer| layer.kind.common().weight as usize)
2657 .collect();
2658 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2659 .iter()
2660 .zip(&weights)
2661 .enumerate()
2662 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2663 .collect();
2664 let mut weight_sum: usize = weights.iter().sum();
2665 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2666
2667 trace!("cands: {:?}", &cands);
2668
2669 cands.retain(|&i, &mut (target, min, weight)| {
2671 if target <= min {
2672 let target = target.min(nr_left);
2673 weighted[i] = target;
2674 weight_sum -= weight;
2675 nr_left -= target;
2676 false
2677 } else {
2678 true
2679 }
2680 });
2681
2682 trace!("cands after accepting mins: {:?}", &cands);
2683
2684 let calc_share = |nr_left, weight, weight_sum| {
2686 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2687 };
2688
2689 while !cands.is_empty() {
2690 let mut progress = false;
2691
2692 cands.retain(|&i, &mut (target, _min, weight)| {
2693 let share = calc_share(nr_left, weight, weight_sum);
2694 if target <= share {
2695 weighted[i] = target;
2696 weight_sum -= weight;
2697 nr_left -= target;
2698 progress = true;
2699 false
2700 } else {
2701 true
2702 }
2703 });
2704
2705 if !progress {
2706 break;
2707 }
2708 }
2709
2710 trace!("cands after accepting under allotted: {:?}", &cands);
2711
2712 let nr_to_share = nr_left;
2715 for (i, (_target, _min, weight)) in cands.into_iter() {
2716 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2717 weighted[i] = share;
2718 nr_left -= share;
2719 }
2720
2721 trace!("weighted: {:?}", &weighted);
2722
2723 weighted
2724 }
2725
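// Translate a CPU target into (full LLCs, extra cores). E.g. with 16 CPUs per
// LLC and 2 CPUs per core, a target of 40 CPUs maps to (2, 4): two full LLCs
// plus ceil(8 / 2) = 4 cores.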
2726 fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2730 let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2732 let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2734 let cpus_per_llc = cores_per_llc * cpus_per_core;
2735
2736 let full = target / cpus_per_llc;
2737 let extra = target % cpus_per_llc;
2738
2739 (full, extra.div_ceil(cpus_per_core))
2740 }
2741
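// For StickyDynamic layers, rebalance whole LLCs and rebuild core orders:
// first release LLCs from layers whose target shrank, then hand free LLCs to
// layers whose target grew, then build each layer's core order from the
// partially used free LLCs (for the fractional remainder) plus its assigned
// LLCs.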
2742 fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
2750 debug!(
2752 " free: before pass: free_llcs={:?}",
2753 self.cpu_pool.free_llcs
2754 );
2755 for &(idx, target) in layer_targets.iter().rev() {
2756 let layer = &mut self.layers[idx];
2757 let old_tlc = layer.target_llc_cpus;
2758 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2759
2760 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2761 continue;
2762 }
2763
2764 let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
2765
2766 debug!(
2767 " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2768 layer.name,
2769 old_tlc,
2770 new_tlc,
2771 to_free,
2772 layer.assigned_llcs.len(),
2773 self.cpu_pool.free_llcs.len()
2774 );
2775
2776 while to_free > 0 && !layer.assigned_llcs.is_empty() {
2777 let llc = layer.assigned_llcs.pop().unwrap();
2778 self.cpu_pool.free_llcs.push((llc, 0));
2779 to_free -= 1;
2780
2781 debug!(" layer={} freed_llc={}", layer.name, llc);
2782 }
2783 }
2784 debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2785
2786 for &(idx, target) in layer_targets.iter().rev() {
2788 let layer = &mut self.layers[idx];
2789 let old_tlc = layer.target_llc_cpus;
2790 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2791
2792 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2793 continue;
2794 }
2795
2796 let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
2797
2798 debug!(
2799 " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2800 layer.name,
2801 old_tlc,
2802 new_tlc,
2803 to_alloc,
2804 layer.assigned_llcs.len(),
2805 self.cpu_pool.free_llcs.len()
2806 );
2807
2808 while to_alloc > 0 && to_alloc <= self.cpu_pool.free_llcs.len() {
2812 let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2813 layer.assigned_llcs.push(llc);
2814 to_alloc -= 1;
2815
2816 debug!(" layer={} alloc_llc={}", layer.name, llc);
2817 }
2818
2819 debug!(
2820 " alloc: layer={} assigned_llcs={:?}",
2821 layer.name, layer.assigned_llcs
2822 );
2823
2824 layer.target_llc_cpus = new_tlc;
2826 }
2827
2828 for &(idx, _) in layer_targets.iter() {
2831 let mut core_order = vec![];
2832 let layer = &mut self.layers[idx];
2833
2834 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2835 continue;
2836 }
2837
2838 let tlc = layer.target_llc_cpus;
2839 let mut extra = tlc.1;
2840 let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2842 let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2843 let cpus_per_llc = cores_per_llc * cpus_per_core;
2844
2845 for i in 0..self.cpu_pool.free_llcs.len() {
2847 let free_vec = &mut self.cpu_pool.free_llcs;
2848 let avail = cpus_per_llc - free_vec[i].1;
2850 let mut used = extra.min(avail);
2852
2853 let shift = free_vec[i].1;
2854 free_vec[i].1 += used;
2855
2856 let llc_id = free_vec[i].0;
2857 let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2858
2859 for core in llc.cores.iter().skip(shift) {
2860 core_order.push(core.1.id);
2861 if used == 0 {
2862 break;
2863 }
2864 used -= 1;
2865 }
2866
2867 extra -= used;
2868 if extra == 0 {
2869 break;
2870 }
2871 }
2872
2873 core_order.reverse();
2874 layer.core_order = core_order;
2875 }
2876
2877 for free_llc in self.cpu_pool.free_llcs.iter_mut() {
2879 free_llc.1 = 0;
2880 }
2881
2882 for &(idx, _) in layer_targets.iter() {
2883 let layer = &mut self.layers[idx];
2884
2885 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2886 continue;
2887 }
2888
2889 for core in self.topo.all_cores.iter() {
2890 let llc_id = core.1.llc_id;
2891 if layer.assigned_llcs.contains(&llc_id) {
2892 layer.core_order.push(core.1.id);
2893 }
2894 }
2895 layer.core_order.reverse();
2897
2898 debug!(
2899 " alloc: layer={} core_order={:?}",
2900 layer.name, layer.core_order
2901 );
2902 }
2903 }
2904
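// Recompute per-layer CPU targets and apply them: shrink confined/grouped
// layers that are over target, grow those under target, then hand all
// remaining available CPUs to open layers and push the updated cpumasks and
// empty-layer list to BPF.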
2905 fn refresh_cpumasks(&mut self) -> Result<()> {
2906 let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });
2907
2908 let mut updated = false;
2909 let targets = self.calc_target_nr_cpus();
2910 let targets = self.weighted_target_nr_cpus(&targets);
2911
2912 let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
2913 ascending.sort_by(|a, b| a.1.cmp(&b.1));
2914
2915 self.recompute_layer_core_order(&ascending);
2916
2917 let mut force_free = self
2920 .layers
2921 .iter()
2922 .zip(targets.iter())
2923 .any(|(layer, &target)| layer.nr_cpus < target);
2924
2925 for &(idx, target) in ascending.iter().rev() {
2929 let layer = &mut self.layers[idx];
2930 if layer_is_open(layer) {
2931 continue;
2932 }
2933
2934 let nr_cur = layer.cpus.weight();
2935 if nr_cur <= target {
2936 continue;
2937 }
2938 let mut nr_to_free = nr_cur - target;
2939
2940 let nr_to_break_at = nr_to_free / 2;
2945
2946 let mut freed = false;
2947
2948 while nr_to_free > 0 {
2949 let max_to_free = if force_free {
2950 force_free = false;
2951 layer.nr_cpus
2952 } else {
2953 nr_to_free
2954 };
2955
2956 let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
2957 if nr_freed == 0 {
2958 break;
2959 }
2960
2961 nr_to_free = nr_to_free.saturating_sub(nr_freed);
2962 freed = true;
2963
2964 if nr_to_free <= nr_to_break_at {
2965 break;
2966 }
2967 }
2968
2969 if freed {
2970 Self::update_bpf_layer_cpumask(
2971 layer,
2972 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
2973 );
2974 updated = true;
2975 }
2976 }
2977
2978 for &(idx, target) in &ascending {
2984 let layer = &mut self.layers[idx];
2985
2986 if layer_is_open(layer) {
2987 continue;
2988 }
2989
2990 let nr_cur = layer.cpus.weight();
2991 if nr_cur >= target {
2992 continue;
2993 }
2994
2995 let mut nr_to_alloc = target - nr_cur;
2996 let mut alloced = false;
2997
2998 while nr_to_alloc > 0 {
2999 let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
3000 if nr_alloced == 0 {
3001 break;
3002 }
3003 alloced = true;
3004 nr_to_alloc -= nr_alloced.min(nr_to_alloc);
3005 }
3006
3007 if alloced {
3008 Self::update_bpf_layer_cpumask(
3009 layer,
3010 &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx],
3011 );
3012 updated = true;
3013 }
3014 }
3015
3016 if updated {
3018 for (idx, layer) in self.layers.iter_mut().enumerate() {
3019 if !layer_is_open(layer) {
3020 continue;
3021 }
3022
3023 let bpf_layer = &mut self.skel.maps.bss_data.as_mut().unwrap().layers[idx];
3024 let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
3025 let nr_available_cpus = available_cpus.weight();
3026
3027 layer.cpus = available_cpus;
3030 layer.nr_cpus = nr_available_cpus;
3031 Self::update_bpf_layer_cpumask(layer, bpf_layer);
3032 }
3033
3034 self.skel.maps.bss_data.as_mut().unwrap().fallback_cpu =
3035 self.cpu_pool.fallback_cpu as u32;
3036
3037 for (lidx, layer) in self.layers.iter().enumerate() {
3038 self.nr_layer_cpus_ranges[lidx] = (
3039 self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
3040 self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
3041 );
3042 }
3043
3044 let input = ProgramInput {
3046 ..Default::default()
3047 };
3048 let prog = &mut self.skel.progs.refresh_layer_cpumasks;
3049 let _ = prog.test_run(input);
3050
3051 let empty_layer_ids: Vec<u32> = self
3053 .layers
3054 .iter()
3055 .enumerate()
3056 .filter(|(_idx, layer)| layer.nr_cpus == 0)
3057 .map(|(idx, _layer)| idx as u32)
3058 .collect();
3059 for i in 0..self.layers.len() {
3060 self.skel.maps.bss_data.as_mut().unwrap().empty_layer_ids[i] =
3061 empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
3062 }
3063 self.skel.maps.bss_data.as_mut().unwrap().nr_empty_layer_ids =
3064 empty_layer_ids.len() as u32;
3065 }
3066
3067 let _ = self.update_netdev_cpumasks();
3068 Ok(())
3069 }
3070
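// Program each CPU's idle resume latency from the idle_resume_us of the layer
// that currently owns it; CPUs not claimed by any such layer are reset to 0.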
3071 fn refresh_idle_qos(&mut self) -> Result<()> {
3072 if !self.idle_qos_enabled {
3073 return Ok(());
3074 }
3075
3076 let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
3077 for layer in self.layers.iter() {
3078 let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
3079 for cpu in layer.cpus.iter() {
3080 cpu_idle_qos[cpu] = idle_resume_us;
3081 }
3082 }
3083
3084 for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
3085 update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
3086 }
3087
3088 Ok(())
3089 }
3090
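// One scheduling tick: refresh stats from BPF, recompute layer cpumasks and
// idle QoS, let the GPU task affinitizer run if due, and account the time
// spent in userspace processing.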
3091 fn step(&mut self) -> Result<()> {
3092 let started_at = Instant::now();
3093 self.sched_stats.refresh(
3094 &mut self.skel,
3095 &self.proc_reader,
3096 started_at,
3097 self.processing_dur,
3098 &self.gpu_task_handler,
3099 )?;
3100 self.refresh_cpumasks()?;
3101 self.refresh_idle_qos()?;
3102 self.gpu_task_handler.maybe_affinitize();
3103 self.processing_dur += Instant::now().duration_since(started_at);
3104 Ok(())
3105 }
3106
3107 fn generate_sys_stats(
3108 &mut self,
3109 stats: &Stats,
3110 cpus_ranges: &mut [(usize, usize)],
3111 ) -> Result<SysStats> {
3112 let bstats = &stats.bpf_stats;
3113 let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;
3114
3115 for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
3116 let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
3117 sys_stats.layers.insert(spec.name.to_string(), layer_stats);
3118 cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
3119 }
3120
3121 Ok(sys_stats)
3122 }
3123
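// Main loop: run step() every sched_intv, periodically bump the avg-runtime
// layer refresh sequence, and service stats requests in between until
// shutdown is requested or the BPF scheduler exits.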
3124 fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
3125 let (res_ch, req_ch) = self.stats_server.channels();
3126 let mut next_sched_at = Instant::now() + self.sched_intv;
3127 let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
3128 let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
3129 let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();
3130
3131 while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
3132 let now = Instant::now();
3133
3134 if now >= next_sched_at {
3135 self.step()?;
3136 while next_sched_at < now {
3137 next_sched_at += self.sched_intv;
3138 }
3139 }
3140
3141 if enable_layer_refresh && now >= next_layer_refresh_at {
3142 self.skel
3143 .maps
3144 .bss_data
3145 .as_mut()
3146 .unwrap()
3147 .layer_refresh_seq_avgruntime += 1;
3148 while next_layer_refresh_at < now {
3149 next_layer_refresh_at += self.layer_refresh_intv;
3150 }
3151 }
3152
3153 match req_ch.recv_deadline(next_sched_at) {
3154 Ok(StatsReq::Hello(tid)) => {
3155 cpus_ranges.insert(
3156 tid,
3157 self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
3158 );
3159 let stats =
3160 Stats::new(&mut self.skel, &self.proc_reader, &self.gpu_task_handler)?;
3161 res_ch.send(StatsRes::Hello(stats))?;
3162 }
3163 Ok(StatsReq::Refresh(tid, mut stats)) => {
3164 for i in 0..self.nr_layer_cpus_ranges.len() {
3166 for (_, ranges) in cpus_ranges.iter_mut() {
3167 ranges[i] = (
3168 ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
3169 ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
3170 );
3171 }
3172 self.nr_layer_cpus_ranges[i] =
3173 (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
3174 }
3175
3176 stats.refresh(
3177 &mut self.skel,
3178 &self.proc_reader,
3179 now,
3180 self.processing_dur,
3181 &self.gpu_task_handler,
3182 )?;
3183 let sys_stats =
3184 self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
3185 res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
3186 }
3187 Ok(StatsReq::Bye(tid)) => {
3188 cpus_ranges.remove(&tid);
3189 res_ch.send(StatsRes::Bye)?;
3190 }
3191 Err(RecvTimeoutError::Timeout) => {}
3192 Err(e) => Err(e)?,
3193 }
3194 }
3195
3196 let _ = self.struct_ops.take();
3197 uei_report!(&self.skel, uei)
3198 }
3199}
3200
3201impl Drop for Scheduler<'_> {
3202 fn drop(&mut self) {
3203 info!("Unregister {SCHEDULER_NAME} scheduler");
3204
3205 if let Some(struct_ops) = self.struct_ops.take() {
3206 drop(struct_ops);
3207 }
3208 }
3209}
3210
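// Write the example configuration as pretty-printed JSON, refusing to
// overwrite an existing file (create_new).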
3211fn write_example_file(path: &str) -> Result<()> {
3212 let mut f = fs::OpenOptions::new()
3213 .create_new(true)
3214 .write(true)
3215 .open(path)?;
3216 Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
3217}
3218
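// Validate layer specs (spec count, terminal spec shape, per-spec match
// limits, string lengths, hint uniqueness, cpus/util ranges) and return the
// hint value -> layer index map used to populate the BPF hint map.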
3219fn verify_layer_specs(specs: &[LayerSpec]) -> Result<HashMap<u64, usize>> {
3220 let mut hint_to_layer_map = HashMap::<u64, (usize, String)>::new();
3221
3222 let nr_specs = specs.len();
3223 if nr_specs == 0 {
3224 bail!("No layer spec");
3225 }
3226 if nr_specs > MAX_LAYERS {
3227 bail!("Too many layer specs");
3228 }
3229
3230 for (idx, spec) in specs.iter().enumerate() {
3231 if idx < nr_specs - 1 {
3232 if spec.matches.is_empty() {
3233 bail!("Non-terminal spec {:?} has NULL matches", spec.name);
3234 }
3235 } else {
3236 if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
3237 bail!("Terminal spec {:?} must have an empty match", spec.name);
3238 }
3239 }
3240
3241 if spec.matches.len() > MAX_LAYER_MATCH_ORS {
3242 bail!(
3243 "Spec {:?} has too many ({}) OR match blocks",
3244 spec.name,
3245 spec.matches.len()
3246 );
3247 }
3248
3249 for (ands_idx, ands) in spec.matches.iter().enumerate() {
3250 if ands.len() > NR_LAYER_MATCH_KINDS {
3251 bail!(
3252 "Spec {:?}'s {}th OR block has too many ({}) match conditions",
3253 spec.name,
3254 ands_idx,
3255 ands.len()
3256 );
3257 }
3258 let mut hint_equals_cnt = 0;
3259 for one in ands.iter() {
3260 match one {
3261 LayerMatch::CgroupPrefix(prefix) => {
3262 if prefix.len() > MAX_PATH {
3263 bail!("Spec {:?} has too long a cgroup prefix", spec.name);
3264 }
3265 }
3266 LayerMatch::CgroupSuffix(suffix) => {
3267 if suffix.len() > MAX_PATH {
3268 bail!("Spec {:?} has too long a cgroup suffix", spec.name);
3269 }
3270 }
3271 LayerMatch::CgroupContains(substr) => {
3272 if substr.len() > MAX_PATH {
3273 bail!("Spec {:?} has too long a cgroup substr", spec.name);
3274 }
3275 }
3276 LayerMatch::CommPrefix(prefix) => {
3277 if prefix.len() > MAX_COMM {
3278 bail!("Spec {:?} has too long a comm prefix", spec.name);
3279 }
3280 }
3281 LayerMatch::PcommPrefix(prefix) => {
3282 if prefix.len() > MAX_COMM {
3283 bail!("Spec {:?} has too long a process name prefix", spec.name);
3284 }
3285 }
3286 LayerMatch::HintEquals(hint) => {
3287 if *hint > 1024 {
3288 bail!(
3289 "Spec {:?} has hint value outside the range [0, 1024]",
3290 spec.name
3291 );
3292 }
3293
3294 if let Some((layer_id, name)) = hint_to_layer_map.get(hint) {
3295 if *layer_id != idx {
3296 bail!(
3297 "Spec {:?} has hint value ({}) that is already mapped to Spec {:?}",
3298 spec.name,
3299 hint,
3300 name
3301 );
3302 }
3303 } else {
3304 hint_to_layer_map.insert(*hint, (idx, spec.name.clone()));
3305 }
3306 hint_equals_cnt += 1;
3307 }
3308 _ => {}
3309 }
3310 }
3311 if hint_equals_cnt > 1 {
3312 bail!("Only 1 HintEquals match permitted per AND block");
3313 }
3314 if hint_equals_cnt == 1 && ands.len() != 1 {
3315 bail!("HintEquals match cannot be in conjunction with other matches");
3316 }
3317 }
3318
3319 match spec.kind {
3320 LayerKind::Confined {
3321 cpus_range,
3322 util_range,
3323 ..
3324 }
3325 | LayerKind::Grouped {
3326 cpus_range,
3327 util_range,
3328 ..
3329 } => {
3330 if let Some((cpus_min, cpus_max)) = cpus_range {
3331 if cpus_min > cpus_max {
3332 bail!(
3333 "Spec {:?} has invalid cpus_range({}, {})",
3334 spec.name,
3335 cpus_min,
3336 cpus_max
3337 );
3338 }
3339 }
3340 if util_range.0 >= util_range.1 {
3341 bail!(
3342 "Spec {:?} has invalid util_range ({}, {})",
3343 spec.name,
3344 util_range.0,
3345 util_range.1
3346 );
3347 }
3348 }
3349 _ => {}
3350 }
3351 }
3352
3353 Ok(hint_to_layer_map
3354 .into_iter()
3355 .map(|(k, v)| (k, v.0))
3356 .collect())
3357}
3358
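// Return the last `len` characters of `cgroup` (or the whole string if it is
// shorter); used below to truncate cgroup paths to 64 characters for suffix
// matching.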
3359fn name_suffix(cgroup: &str, len: usize) -> String {
3360 let suffixlen = std::cmp::min(len, cgroup.len());
3361 let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();
3362
3363 suffixrev.chars().rev().collect()
3364}
3365
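// Recursively collect every directory under `dir` (used to enumerate cgroups
// beneath /sys/fs/cgroup).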
3366fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
3367 let mut paths = vec![];
3368
3369 if !dir.is_dir() {
3370 panic!("path {:?} does not correspond to directory", dir);
3371 }
3372
3373 let direntries = fs::read_dir(dir)?;
3374
3375 for entry in direntries {
3376 let path = entry?.path();
3377 if path.is_dir() {
3378 paths.append(&mut traverse_sysfs(&path)?);
3379 paths.push(path);
3380 }
3381 }
3382
3383 Ok(paths)
3384}
3385
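// Read a cgroup's cpuset.cpus.effective and parse it into a Cpumask.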
3386fn find_cpumask(cgroup: &str) -> Cpumask {
3387 let mut path = String::from(cgroup);
3388 path.push_str("/cpuset.cpus.effective");
3389
3390 let description = fs::read_to_string(&mut path).unwrap();
3391
3392 Cpumask::from_cpulist(&description).unwrap()
3393}
3394
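// Expand a template match into one concrete (match, cpumask) pair per
// matching cgroup by walking /sys/fs/cgroup: CgroupSuffix and CgroupRegex
// both expand to truncated CgroupSuffix matches paired with the cgroup's
// effective cpuset.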
3395fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
3396 match rule {
3397 LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3398 .into_iter()
3399 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3400 .filter(|cgroup| cgroup.ends_with(suffix))
3401 .map(|cgroup| {
3402 (
3403 {
3404 let mut slashterminated = cgroup.clone();
3405 slashterminated.push('/');
3406 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3407 },
3408 find_cpumask(&cgroup),
3409 )
3410 })
3411 .collect()),
3412 LayerMatch::CgroupRegex(expr) => {
let re = Regex::new(expr).context("invalid cgroup regex in template")?;
3413 Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
3414 .into_iter()
3415 .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
3416 .filter(|cgroup| re.is_match(cgroup))
3419 .map(|cgroup| {
3420 (
3421 {
3425 let mut slashterminated = cgroup.clone();
3426 slashterminated.push('/');
3427 LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
3428 },
3429 find_cpumask(&cgroup),
3430 )
3431 })
3432 .collect())
}
3433 _ => bail!("Unimplemented template match kind {:?}", rule),
3434 }
3435}
3436
3437fn main() -> Result<()> {
3438 let opts = Opts::parse();
3439
3440 if opts.version {
3441 println!(
3442 "scx_layered {}",
3443 build_id::full_version(env!("CARGO_PKG_VERSION"))
3444 );
3445 return Ok(());
3446 }
3447
3448 if opts.help_stats {
3449 stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
3450 return Ok(());
3451 }
3452
3453 if opts.no_load_frac_limit {
3454 warn!("--no-load-frac-limit is deprecated and has no effect");
3455 }
3456 if opts.layer_preempt_weight_disable != 0.0 {
3457 warn!("--layer-preempt-weight-disable is deprecated and has no effect");
3458 }
3459 if opts.layer_growth_weight_disable != 0.0 {
3460 warn!("--layer-growth-weight-disable is deprecated and has no effect");
3461 }
3462 if opts.local_llc_iteration {
3463 warn!("--local_llc_iteration is deprecated and has no effect");
3464 }
3465
3466 let llv = match opts.verbose {
3467 0 => simplelog::LevelFilter::Info,
3468 1 => simplelog::LevelFilter::Debug,
3469 _ => simplelog::LevelFilter::Trace,
3470 };
3471 let mut lcfg = simplelog::ConfigBuilder::new();
3472 lcfg.set_time_offset_to_local()
3473 .expect("Failed to set local time offset")
3474 .set_time_level(simplelog::LevelFilter::Error)
3475 .set_location_level(simplelog::LevelFilter::Off)
3476 .set_target_level(simplelog::LevelFilter::Off)
3477 .set_thread_level(simplelog::LevelFilter::Off);
3478 simplelog::TermLogger::init(
3479 llv,
3480 lcfg.build(),
3481 simplelog::TerminalMode::Stderr,
3482 simplelog::ColorChoice::Auto,
3483 )?;
3484
3485 debug!("opts={:?}", &opts);
3486
3487 let shutdown = Arc::new(AtomicBool::new(false));
3488 let shutdown_clone = shutdown.clone();
3489 ctrlc::set_handler(move || {
3490 shutdown_clone.store(true, Ordering::Relaxed);
3491 })
3492 .context("Error setting Ctrl-C handler")?;
3493
3494 if let Some(intv) = opts.monitor.or(opts.stats) {
3495 let shutdown_copy = shutdown.clone();
3496 let jh = std::thread::spawn(move || {
3497 match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
3498 Ok(_) => {
3499 debug!("stats monitor thread finished successfully")
3500 }
3501 Err(err) => {
3502 warn!("stats monitor thread exited with error: {}", err)
3506 }
3507 }
3508 });
3509 if opts.monitor.is_some() {
3510 let _ = jh.join();
3511 return Ok(());
3512 }
3513 }
3514
3515 if let Some(path) = &opts.example {
3516 write_example_file(path)?;
3517 return Ok(());
3518 }
3519
3520 let mut layer_config = match opts.run_example {
3521 true => EXAMPLE_CONFIG.clone(),
3522 false => LayerConfig { specs: vec![] },
3523 };
3524
3525 for (idx, input) in opts.specs.iter().enumerate() {
3526 let specs = LayerSpec::parse(input)
3527 .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;
3528
3529 for spec in specs {
3530 match spec.template {
3531 Some(ref rule) => {
3532 let matches = expand_template(&rule)?;
3533 if matches.is_empty() {
3536 layer_config.specs.push(spec);
3537 } else {
3538 for (mt, mask) in matches {
3539 let mut genspec = spec.clone();
3540
3541 genspec.cpuset = Some(mask);
3542
3543 for orterm in &mut genspec.matches {
3545 orterm.push(mt.clone());
3546 }
3547
3548 match &mt {
3549 LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
3550 _ => bail!("Template match has unexpected type"),
3551 }
3552
3553 layer_config.specs.push(genspec);
3555 }
3556 }
3557 }
3558
3559 None => {
3560 layer_config.specs.push(spec);
3561 }
3562 }
3563 }
3564 }
3565
3566 for spec in layer_config.specs.iter_mut() {
3567 let common = spec.kind.common_mut();
3568
3569 if common.slice_us == 0 {
3570 common.slice_us = opts.slice_us;
3571 }
3572
3573 if common.weight == 0 {
3574 common.weight = DEFAULT_LAYER_WEIGHT;
3575 }
3576 common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);
3577
3578 if common.preempt {
3579 if common.disallow_open_after_us.is_some() {
3580 warn!(
3581 "Preempt layer {} has non-null disallow_open_after_us, ignored",
3582 &spec.name
3583 );
3584 }
3585 if common.disallow_preempt_after_us.is_some() {
3586 warn!(
3587 "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
3588 &spec.name
3589 );
3590 }
3591 common.disallow_open_after_us = Some(u64::MAX);
3592 common.disallow_preempt_after_us = Some(u64::MAX);
3593 } else {
3594 if common.disallow_open_after_us.is_none() {
3595 common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
3596 }
3597
3598 if common.disallow_preempt_after_us.is_none() {
3599 common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
3600 }
3601 }
3602
3603 if common.idle_smt.is_some() {
3604 warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
3605 }
3606 }
3607
3608 if opts.print_and_exit {
3609 println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3610 return Ok(());
3611 }
3612
3613 debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
3614 let hint_to_layer_map = verify_layer_specs(&layer_config.specs)?;
3615
3616 let mut open_object = MaybeUninit::uninit();
3617 loop {
3618 let mut sched = Scheduler::init(
3619 &opts,
3620 &layer_config.specs,
3621 &mut open_object,
3622 &hint_to_layer_map,
3623 )?;
3624 if !sched.run(shutdown.clone())?.should_restart() {
3625 break;
3626 }
3627 }
3628
3629 Ok(())
3630}