1mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::ffi::CString;
11use std::fs;
12use std::io::Write;
13use std::mem::MaybeUninit;
14use std::ops::Sub;
15use std::path::Path;
16use std::path::PathBuf;
17use std::sync::atomic::AtomicBool;
18use std::sync::atomic::Ordering;
19use std::sync::Arc;
20use std::thread::ThreadId;
21use std::time::Duration;
22use std::time::Instant;
23
24use ::fb_procfs as procfs;
25use anyhow::anyhow;
26use anyhow::bail;
27use anyhow::Context;
28use anyhow::Result;
29pub use bpf_skel::*;
30use clap::Parser;
31use crossbeam::channel::RecvTimeoutError;
32use lazy_static::lazy_static;
33use libbpf_rs::MapCore as _;
34use libbpf_rs::OpenObject;
35use libbpf_rs::ProgramInput;
36use log::debug;
37use log::info;
38use log::trace;
39use log::warn;
40use scx_layered::*;
41use scx_stats::prelude::*;
42use scx_utils::compat;
43use scx_utils::init_libbpf_logging;
44use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
45use scx_utils::read_netdevs;
46use scx_utils::scx_enums;
47use scx_utils::scx_ops_attach;
48use scx_utils::scx_ops_load;
49use scx_utils::scx_ops_open;
50use scx_utils::uei_exited;
51use scx_utils::uei_report;
52use scx_utils::CoreType;
53use scx_utils::Cpumask;
54use scx_utils::Llc;
55use scx_utils::NetDev;
56use scx_utils::Topology;
57use scx_utils::UserExitInfo;
58use scx_utils::NR_CPUS_POSSIBLE;
59use scx_utils::NR_CPU_IDS;
60use stats::LayerStats;
61use stats::StatsReq;
62use stats::StatsRes;
63use stats::SysStats;
64
65const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
66const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
67const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
68const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
69const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
70const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
71const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
72const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
73const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
74const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
75
76const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
77const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
78const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
79const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
80const LAYER_USAGE_PROTECTED_PREEMPT: usize =
81 bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
82const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
83
84const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
85const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
86const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
87
88const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
89
90lazy_static! {
91 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
92 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
93 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
94 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
95 specs: vec![
96 LayerSpec {
97 name: "batch".into(),
98 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
99 cpuset: None,
100 template: None,
101 matches: vec![
102 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
103 vec![LayerMatch::NiceAbove(0)],
104 ],
105 kind: LayerKind::Confined {
106 util_range: (0.8, 0.9),
107 cpus_range: Some((0, 16)),
108 cpus_range_frac: None,
109 protected: false,
110 common: LayerCommon {
111 min_exec_us: 1000,
112 yield_ignore: 0.0,
113 preempt: false,
114 preempt_first: false,
115 exclusive: false,
116 allow_node_aligned: false,
117 skip_remote_node: false,
118 prev_over_idle_core: false,
119 idle_smt: None,
120 slice_us: 20000,
121 fifo: false,
122 weight: DEFAULT_LAYER_WEIGHT,
123 disallow_open_after_us: None,
124 disallow_preempt_after_us: None,
125 xllc_mig_min_us: 1000.0,
126 growth_algo: LayerGrowthAlgo::Sticky,
127 idle_resume_us: None,
128 perf: 1024,
129 nodes: vec![],
130 llcs: vec![],
131 placement: LayerPlacement::Standard,
132 },
133 },
134 },
135 LayerSpec {
136 name: "immediate".into(),
137 comment: Some("tasks under workload.slice with nice value < 0".into()),
138 cpuset: None,
139 template: None,
140 matches: vec![vec![
141 LayerMatch::CgroupPrefix("workload.slice/".into()),
142 LayerMatch::NiceBelow(0),
143 ]],
144 kind: LayerKind::Open {
145 common: LayerCommon {
146 min_exec_us: 100,
147 yield_ignore: 0.25,
148 preempt: true,
149 preempt_first: false,
150 exclusive: true,
151 allow_node_aligned: true,
152 skip_remote_node: false,
153 prev_over_idle_core: true,
154 idle_smt: None,
155 slice_us: 20000,
156 fifo: false,
157 weight: DEFAULT_LAYER_WEIGHT,
158 disallow_open_after_us: None,
159 disallow_preempt_after_us: None,
160 xllc_mig_min_us: 0.0,
161 growth_algo: LayerGrowthAlgo::Sticky,
162 perf: 1024,
163 idle_resume_us: None,
164 nodes: vec![],
165 llcs: vec![],
166 placement: LayerPlacement::Standard,
167 },
168 },
169 },
170 LayerSpec {
171 name: "stress-ng".into(),
172 comment: Some("stress-ng test layer".into()),
173 cpuset: None,
174 template: None,
175 matches: vec![
176 vec![LayerMatch::CommPrefix("stress-ng".into()),],
177 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
178 ],
179 kind: LayerKind::Confined {
180 cpus_range: None,
181 util_range: (0.2, 0.8),
182 protected: false,
183 cpus_range_frac: None,
184 common: LayerCommon {
185 min_exec_us: 800,
186 yield_ignore: 0.0,
187 preempt: true,
188 preempt_first: false,
189 exclusive: false,
190 allow_node_aligned: false,
191 skip_remote_node: false,
192 prev_over_idle_core: false,
193 idle_smt: None,
194 slice_us: 800,
195 fifo: false,
196 weight: DEFAULT_LAYER_WEIGHT,
197 disallow_open_after_us: None,
198 disallow_preempt_after_us: None,
199 xllc_mig_min_us: 0.0,
200 growth_algo: LayerGrowthAlgo::Topo,
201 perf: 1024,
202 idle_resume_us: None,
203 nodes: vec![],
204 llcs: vec![],
205 placement: LayerPlacement::Standard,
206 },
207 },
208 },
209 LayerSpec {
210 name: "normal".into(),
211 comment: Some("the rest".into()),
212 cpuset: None,
213 template: None,
214 matches: vec![vec![]],
215 kind: LayerKind::Grouped {
216 cpus_range: None,
217 util_range: (0.5, 0.6),
218 util_includes_open_cputime: true,
219 protected: false,
220 cpus_range_frac: None,
221 common: LayerCommon {
222 min_exec_us: 200,
223 yield_ignore: 0.0,
224 preempt: false,
225 preempt_first: false,
226 exclusive: false,
227 allow_node_aligned: false,
228 skip_remote_node: false,
229 prev_over_idle_core: false,
230 idle_smt: None,
231 slice_us: 20000,
232 fifo: false,
233 weight: DEFAULT_LAYER_WEIGHT,
234 disallow_open_after_us: None,
235 disallow_preempt_after_us: None,
236 xllc_mig_min_us: 100.0,
237 growth_algo: LayerGrowthAlgo::Linear,
238 perf: 1024,
239 idle_resume_us: None,
240 nodes: vec![],
241 llcs: vec![],
242 placement: LayerPlacement::Standard,
243 },
244 },
245 },
246 ],
247 };
248}
249
250#[derive(Debug, Parser)]
527#[command(verbatim_doc_comment)]
528struct Opts {
529 #[clap(short = 's', long, default_value = "20000")]
531 slice_us: u64,
532
533 #[clap(short = 'M', long, default_value = "0")]
538 max_exec_us: u64,
539
540 #[clap(short = 'i', long, default_value = "0.1")]
542 interval: f64,
543
544 #[clap(short = 'n', long, default_value = "false")]
547 no_load_frac_limit: bool,
548
549 #[clap(long, default_value = "0")]
551 exit_dump_len: u32,
552
553 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
556 verbose: u8,
557
558 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
562 disable_topology: Option<bool>,
563
564 #[clap(long)]
566 xnuma_preemption: bool,
567
568 #[clap(long)]
570 monitor_disable: bool,
571
572 #[clap(short = 'e', long)]
574 example: Option<String>,
575
576 #[clap(long, default_value = "0.0")]
580 layer_preempt_weight_disable: f64,
581
582 #[clap(long, default_value = "0.0")]
586 layer_growth_weight_disable: f64,
587
588 #[clap(long)]
590 stats: Option<f64>,
591
592 #[clap(long)]
595 monitor: Option<f64>,
596
597 #[clap(long)]
599 run_example: bool,
600
601 #[clap(long, default_value = "false")]
604 local_llc_iteration: bool,
605
606 #[clap(long, default_value = "10000")]
611 lo_fb_wait_us: u64,
612
613 #[clap(long, default_value = ".05")]
616 lo_fb_share: f64,
617
618 #[clap(long, default_value = "false")]
620 disable_antistall: bool,
621
622 #[clap(long, default_value = "false")]
627 enable_match_debug: bool,
628
629 #[clap(long, default_value = "3")]
631 antistall_sec: u64,
632
633 #[clap(long, default_value = "false")]
635 enable_gpu_support: bool,
636
637 #[clap(long, default_value = "3")]
645 gpu_kprobe_level: u64,
646
647 #[clap(long, default_value = "false")]
649 netdev_irq_balance: bool,
650
651 #[clap(long, default_value = "false")]
653 disable_queued_wakeup: bool,
654
655 #[clap(long, default_value = "false")]
657 disable_percpu_kthread_preempt: bool,
658
659 #[clap(long, default_value = "false")]
663 percpu_kthread_preempt_all: bool,
664
665 #[clap(long)]
667 help_stats: bool,
668
669 specs: Vec<String>,
671
672 #[clap(long, default_value = "2000")]
675 layer_refresh_ms_avgruntime: u64,
676
677 #[clap(long, default_value = "")]
679 task_hint_map: String,
680
681 #[clap(long, default_value = "false")]
683 print_and_exit: bool,
684}
685
686fn read_total_cpu(reader: &procfs::ProcReader) -> Result<procfs::CpuStat> {
687 reader
688 .read_stat()
689 .context("Failed to read procfs")?
690 .total_cpu
691 .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
692}
693
694fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
695 match (curr, prev) {
696 (
697 procfs::CpuStat {
698 user_usec: Some(curr_user),
699 nice_usec: Some(curr_nice),
700 system_usec: Some(curr_system),
701 idle_usec: Some(curr_idle),
702 iowait_usec: Some(curr_iowait),
703 irq_usec: Some(curr_irq),
704 softirq_usec: Some(curr_softirq),
705 stolen_usec: Some(curr_stolen),
706 ..
707 },
708 procfs::CpuStat {
709 user_usec: Some(prev_user),
710 nice_usec: Some(prev_nice),
711 system_usec: Some(prev_system),
712 idle_usec: Some(prev_idle),
713 iowait_usec: Some(prev_iowait),
714 irq_usec: Some(prev_irq),
715 softirq_usec: Some(prev_softirq),
716 stolen_usec: Some(prev_stolen),
717 ..
718 },
719 ) => {
720 let idle_usec = curr_idle.saturating_sub(*prev_idle);
721 let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
722 let user_usec = curr_user.saturating_sub(*prev_user);
723 let system_usec = curr_system.saturating_sub(*prev_system);
724 let nice_usec = curr_nice.saturating_sub(*prev_nice);
725 let irq_usec = curr_irq.saturating_sub(*prev_irq);
726 let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
727 let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
728
729 let busy_usec =
730 user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
731 let total_usec = idle_usec + busy_usec + iowait_usec;
732 if total_usec > 0 {
733 Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
734 } else {
735 Ok(1.0)
736 }
737 }
738 _ => bail!("Missing stats in cpustat"),
739 }
740}
741
742fn copy_into_cstr(dst: &mut [i8], src: &str) {
743 let cstr = CString::new(src).unwrap();
744 let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
745 dst[0..bytes.len()].copy_from_slice(bytes);
746}
747
748fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
749 let mut mask = 0;
750 for node in nodes {
751 mask |= 1 << node;
752 }
753 mask
754}
755
756fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
757 let mut mask = 0;
758 for (_, cache) in llcs {
759 mask |= 1 << cache.id;
760 }
761 mask
762}
763
764fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
765 let mut cpu_ctxs = vec![];
766 let cpu_ctxs_vec = skel
767 .maps
768 .cpu_ctxs
769 .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
770 .context("Failed to lookup cpu_ctx")?
771 .unwrap();
772 for cpu in 0..*NR_CPUS_POSSIBLE {
773 cpu_ctxs.push(*unsafe {
774 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
775 });
776 }
777 Ok(cpu_ctxs)
778}
779
780#[derive(Clone, Debug)]
781struct BpfStats {
782 gstats: Vec<u64>,
783 lstats: Vec<Vec<u64>>,
784 lstats_sums: Vec<u64>,
785 llc_lstats: Vec<Vec<Vec<u64>>>, }
787
788impl BpfStats {
789 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
790 let nr_layers = skel.maps.rodata_data.nr_layers as usize;
791 let nr_llcs = skel.maps.rodata_data.nr_llcs as usize;
792 let mut gstats = vec![0u64; NR_GSTATS];
793 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
794 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
795
796 for cpu in 0..*NR_CPUS_POSSIBLE {
797 for stat in 0..NR_GSTATS {
798 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
799 }
800 for layer in 0..nr_layers {
801 for stat in 0..NR_LSTATS {
802 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
803 }
804 }
805 }
806
807 let mut lstats_sums = vec![0u64; NR_LSTATS];
808 for layer in 0..nr_layers {
809 for stat in 0..NR_LSTATS {
810 lstats_sums[stat] += lstats[layer][stat];
811 }
812 }
813
814 for llc_id in 0..nr_llcs {
815 let key = llc_id as u32;
821 let llc_id_slice =
822 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
823 let v = skel
824 .maps
825 .llc_data
826 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
827 .unwrap()
828 .unwrap();
829 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
830
831 for layer_id in 0..nr_layers {
832 for stat_id in 0..NR_LLC_LSTATS {
833 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
834 }
835 }
836 }
837
838 Self {
839 gstats,
840 lstats,
841 lstats_sums,
842 llc_lstats,
843 }
844 }
845}
846
847impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
848 type Output = BpfStats;
849
850 fn sub(self, rhs: &'b BpfStats) -> BpfStats {
851 let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
852 BpfStats {
853 gstats: vec_sub(&self.gstats, &rhs.gstats),
854 lstats: self
855 .lstats
856 .iter()
857 .zip(rhs.lstats.iter())
858 .map(|(l, r)| vec_sub(l, r))
859 .collect(),
860 lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
861 llc_lstats: self
862 .llc_lstats
863 .iter()
864 .zip(rhs.llc_lstats.iter())
865 .map(|(l_layer, r_layer)| {
866 l_layer
867 .iter()
868 .zip(r_layer.iter())
869 .map(|(l_llc, r_llc)| {
870 let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
871 r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
873 vec_sub(&l_llc, &r_llc)
874 })
875 .collect()
876 })
877 .collect(),
878 }
879 }
880}
881
882#[derive(Clone, Debug)]
883struct Stats {
884 at: Instant,
885 elapsed: Duration,
886 nr_layers: usize,
887 nr_layer_tasks: Vec<usize>,
888 nr_nodes: usize,
889
890 total_util: f64, layer_utils: Vec<Vec<f64>>,
892 prev_layer_usages: Vec<Vec<u64>>,
893
894 cpu_busy: f64, prev_total_cpu: procfs::CpuStat,
896
897 bpf_stats: BpfStats,
898 prev_bpf_stats: BpfStats,
899
900 processing_dur: Duration,
901 prev_processing_dur: Duration,
902
903 layer_slice_us: Vec<u64>,
904}
905
906impl Stats {
907 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
908 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
909
910 for cpu in 0..*NR_CPUS_POSSIBLE {
911 for layer in 0..nr_layers {
912 for usage in 0..NR_LAYER_USAGES {
913 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
914 }
915 }
916 }
917
918 layer_usages
919 }
920
921 fn new(skel: &mut BpfSkel, proc_reader: &procfs::ProcReader) -> Result<Self> {
922 let nr_layers = skel.maps.rodata_data.nr_layers as usize;
923 let cpu_ctxs = read_cpu_ctxs(skel)?;
924 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
925 let nr_nodes = skel.maps.rodata_data.nr_nodes as usize;
926
927 Ok(Self {
928 at: Instant::now(),
929 elapsed: Default::default(),
930 nr_layers,
931 nr_layer_tasks: vec![0; nr_layers],
932 nr_nodes,
933
934 total_util: 0.0,
935 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
936 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
937
938 cpu_busy: 0.0,
939 prev_total_cpu: read_total_cpu(proc_reader)?,
940
941 bpf_stats: bpf_stats.clone(),
942 prev_bpf_stats: bpf_stats,
943
944 processing_dur: Default::default(),
945 prev_processing_dur: Default::default(),
946
947 layer_slice_us: vec![0; nr_layers],
948 })
949 }
950
951 fn refresh(
952 &mut self,
953 skel: &mut BpfSkel,
954 proc_reader: &procfs::ProcReader,
955 now: Instant,
956 cur_processing_dur: Duration,
957 ) -> Result<()> {
958 let elapsed = now.duration_since(self.at);
959 let elapsed_f64 = elapsed.as_secs_f64();
960 let cpu_ctxs = read_cpu_ctxs(skel)?;
961
962 let nr_layer_tasks: Vec<usize> = skel
963 .maps
964 .bss_data
965 .layers
966 .iter()
967 .take(self.nr_layers)
968 .map(|layer| layer.nr_tasks as usize)
969 .collect();
970 let layer_slice_us: Vec<u64> = skel
971 .maps
972 .bss_data
973 .layers
974 .iter()
975 .take(self.nr_layers)
976 .map(|layer| layer.slice_ns / 1000_u64)
977 .collect();
978
979 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
980 let cur_layer_utils: Vec<Vec<f64>> = cur_layer_usages
981 .iter()
982 .zip(self.prev_layer_usages.iter())
983 .map(|(cur, prev)| {
984 cur.iter()
985 .zip(prev.iter())
986 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
987 .collect()
988 })
989 .collect();
990 let layer_utils: Vec<Vec<f64>> = cur_layer_utils
991 .iter()
992 .zip(self.layer_utils.iter())
993 .map(|(cur, prev)| {
994 cur.iter()
995 .zip(prev.iter())
996 .map(|(c, p)| {
997 let decay = USAGE_DECAY.powf(elapsed_f64);
998 p * decay + c * (1.0 - decay)
999 })
1000 .collect()
1001 })
1002 .collect();
1003
1004 let cur_total_cpu = read_total_cpu(proc_reader)?;
1005 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
1006
1007 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
1008 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
1009
1010 let processing_dur = cur_processing_dur
1011 .checked_sub(self.prev_processing_dur)
1012 .unwrap();
1013
1014 *self = Self {
1015 at: now,
1016 elapsed,
1017 nr_layers: self.nr_layers,
1018 nr_layer_tasks,
1019 nr_nodes: self.nr_nodes,
1020
1021 total_util: layer_utils
1022 .iter()
1023 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
1024 .sum(),
1025 layer_utils,
1026 prev_layer_usages: cur_layer_usages,
1027
1028 cpu_busy,
1029 prev_total_cpu: cur_total_cpu,
1030
1031 bpf_stats,
1032 prev_bpf_stats: cur_bpf_stats,
1033
1034 processing_dur,
1035 prev_processing_dur: cur_processing_dur,
1036
1037 layer_slice_us,
1038 };
1039 Ok(())
1040 }
1041}
1042
1043#[derive(Debug)]
1044struct Layer {
1045 name: String,
1046 kind: LayerKind,
1047 growth_algo: LayerGrowthAlgo,
1048 core_order: Vec<usize>,
1049
1050 target_llc_cpus: (usize, usize),
1051 assigned_llcs: Vec<usize>,
1052
1053 nr_cpus: usize,
1054 nr_llc_cpus: Vec<usize>,
1055 cpus: Cpumask,
1056 allowed_cpus: Cpumask,
1057}
1058
1059fn get_kallsyms_addr(sym_name: &str) -> Result<u64> {
1060 fs::read_to_string("/proc/kallsyms")?
1061 .lines()
1062 .find(|line| line.contains(sym_name))
1063 .and_then(|line| line.split_whitespace().next())
1064 .and_then(|addr| u64::from_str_radix(addr, 16).ok())
1065 .ok_or_else(|| anyhow!("Symbol '{}' not found", sym_name))
1066}
1067
1068fn resolve_cpus_pct_range(
1069 cpus_range: &Option<(usize, usize)>,
1070 cpus_range_frac: &Option<(f64, f64)>,
1071 max_cpus: usize,
1072) -> Result<(usize, usize)> {
1073 match (cpus_range, cpus_range_frac) {
1074 (Some(_x), Some(_y)) => {
1075 bail!("cpus_range cannot be used with cpus_pct.");
1076 }
1077 (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1078 (None, Some((cpus_frac_min, cpus_frac_max))) => {
1079 if *cpus_frac_min < 0_f64
1080 || *cpus_frac_min > 1_f64
1081 || *cpus_frac_max < 0_f64
1082 || *cpus_frac_max > 1_f64
1083 {
1084 bail!("cpus_range_frac values must be between 0.0 and 1.0");
1085 }
1086 let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1087 let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1088 Ok((
1089 std::cmp::max(cpus_min_count, 1),
1090 std::cmp::min(cpus_max_count, max_cpus),
1091 ))
1092 }
1093 (None, None) => Ok((0, max_cpus)),
1094 }
1095}
1096
1097impl Layer {
1098 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1099 let name = &spec.name;
1100 let kind = spec.kind.clone();
1101 let mut allowed_cpus = Cpumask::new();
1102 match &kind {
1103 LayerKind::Confined {
1104 cpus_range,
1105 cpus_range_frac,
1106 common: LayerCommon { nodes, llcs, .. },
1107 ..
1108 } => {
1109 let cpus_range =
1110 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1111 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1112 bail!("invalid cpus_range {:?}", cpus_range);
1113 }
1114 if nodes.is_empty() && llcs.is_empty() {
1115 allowed_cpus.set_all();
1116 } else {
1117 for (node_id, node) in &topo.nodes {
1119 if nodes.contains(node_id) {
1121 for &id in node.all_cpus.keys() {
1122 allowed_cpus.set_cpu(id)?;
1123 }
1124 }
1125 for (llc_id, llc) in &node.llcs {
1127 if llcs.contains(llc_id) {
1128 for &id in llc.all_cpus.keys() {
1129 allowed_cpus.set_cpu(id)?;
1130 }
1131 }
1132 }
1133 }
1134 }
1135 }
1136 LayerKind::Grouped {
1137 common: LayerCommon { nodes, llcs, .. },
1138 ..
1139 }
1140 | LayerKind::Open {
1141 common: LayerCommon { nodes, llcs, .. },
1142 ..
1143 } => {
1144 if nodes.is_empty() && llcs.is_empty() {
1145 allowed_cpus.set_all();
1146 } else {
1147 for (node_id, node) in &topo.nodes {
1149 if nodes.contains(node_id) {
1151 for &id in node.all_cpus.keys() {
1152 allowed_cpus.set_cpu(id)?;
1153 }
1154 }
1155 for (llc_id, llc) in &node.llcs {
1157 if llcs.contains(llc_id) {
1158 for &id in llc.all_cpus.keys() {
1159 allowed_cpus.set_cpu(id)?;
1160 }
1161 }
1162 }
1163 }
1164 }
1165 }
1166 }
1167
1168 if let Some(util_range) = kind.util_range() {
1171 if util_range.0 < 0.0 || util_range.1 < 0.0 || util_range.0 >= util_range.1 {
1172 bail!("invalid util_range {:?}", util_range);
1173 }
1174 }
1175
1176 let layer_growth_algo = kind.common().growth_algo.clone();
1177
1178 debug!(
1179 "layer: {} algo: {:?} core order: {:?}",
1180 name, &layer_growth_algo, core_order
1181 );
1182
1183 Ok(Self {
1184 name: name.into(),
1185 kind,
1186 growth_algo: layer_growth_algo,
1187 core_order: core_order.clone(),
1188
1189 target_llc_cpus: (0, 0),
1190 assigned_llcs: vec![],
1191
1192 nr_cpus: 0,
1193 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1194 cpus: Cpumask::new(),
1195 allowed_cpus,
1196 })
1197 }
1198
1199 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1200 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1201 Some(ret) => ret.clone(),
1202 None => return Ok(0),
1203 };
1204
1205 let nr_to_free = cpus_to_free.weight();
1206
1207 Ok(if nr_to_free <= max_to_free {
1208 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1209 self.cpus &= &cpus_to_free.not();
1210 self.nr_cpus -= nr_to_free;
1211 for cpu in cpus_to_free.iter() {
1212 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1213 }
1214 cpu_pool.free(&cpus_to_free)?;
1215 nr_to_free
1216 } else {
1217 0
1218 })
1219 }
1220
1221 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1222 let new_cpus = match cpu_pool
1223 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1224 .clone()
1225 {
1226 Some(ret) => ret.clone(),
1227 None => {
1228 trace!("layer-{} can't grow, no CPUs", &self.name);
1229 return Ok(0);
1230 }
1231 };
1232
1233 let nr_new_cpus = new_cpus.weight();
1234
1235 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1236 self.cpus |= &new_cpus;
1237 self.nr_cpus += nr_new_cpus;
1238 for cpu in new_cpus.iter() {
1239 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1240 }
1241 Ok(nr_new_cpus)
1242 }
1243}
1244
1245struct Scheduler<'a> {
1246 skel: BpfSkel<'a>,
1247 struct_ops: Option<libbpf_rs::Link>,
1248 layer_specs: Vec<LayerSpec>,
1249
1250 sched_intv: Duration,
1251 layer_refresh_intv: Duration,
1252
1253 cpu_pool: CpuPool,
1254 layers: Vec<Layer>,
1255 idle_qos_enabled: bool,
1256
1257 proc_reader: procfs::ProcReader,
1258 sched_stats: Stats,
1259
1260 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1261 processing_dur: Duration,
1262
1263 topo: Arc<Topology>,
1264 netdevs: BTreeMap<String, NetDev>,
1265 stats_server: StatsServer<StatsReq, StatsRes>,
1266}
1267
1268impl<'a> Scheduler<'a> {
1269 fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1270 skel.maps.rodata_data.nr_layers = specs.len() as u32;
1271 let mut perf_set = false;
1272
1273 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1274 let mut layer_weights: Vec<usize> = vec![];
1275
1276 for (spec_i, spec) in specs.iter().enumerate() {
1277 let layer = &mut skel.maps.bss_data.layers[spec_i];
1278
1279 for (or_i, or) in spec.matches.iter().enumerate() {
1280 for (and_i, and) in or.iter().enumerate() {
1281 let mt = &mut layer.matches[or_i].matches[and_i];
1282
1283 mt.exclude.write(false);
1285
1286 match and {
1287 LayerMatch::CgroupPrefix(prefix) => {
1288 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1289 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1290 }
1291 LayerMatch::CgroupSuffix(suffix) => {
1292 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_SUFFIX as i32;
1293 copy_into_cstr(&mut mt.cgroup_suffix, suffix.as_str());
1294 }
1295 LayerMatch::CgroupContains(substr) => {
1296 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_CONTAINS as i32;
1297 copy_into_cstr(&mut mt.cgroup_substr, substr.as_str());
1298 }
1299 LayerMatch::CommPrefix(prefix) => {
1300 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1301 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1302 }
1303 LayerMatch::CommPrefixExclude(prefix) => {
1304 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1305 mt.exclude.write(true);
1306 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1307 }
1308 LayerMatch::PcommPrefix(prefix) => {
1309 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1310 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1311 }
1312 LayerMatch::PcommPrefixExclude(prefix) => {
1313 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1314 mt.exclude.write(true);
1315 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1316 }
1317 LayerMatch::NiceAbove(nice) => {
1318 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1319 mt.nice = *nice;
1320 }
1321 LayerMatch::NiceBelow(nice) => {
1322 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1323 mt.nice = *nice;
1324 }
1325 LayerMatch::NiceEquals(nice) => {
1326 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1327 mt.nice = *nice;
1328 }
1329 LayerMatch::UIDEquals(user_id) => {
1330 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1331 mt.user_id = *user_id;
1332 }
1333 LayerMatch::GIDEquals(group_id) => {
1334 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1335 mt.group_id = *group_id;
1336 }
1337 LayerMatch::PIDEquals(pid) => {
1338 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1339 mt.pid = *pid;
1340 }
1341 LayerMatch::PPIDEquals(ppid) => {
1342 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1343 mt.ppid = *ppid;
1344 }
1345 LayerMatch::TGIDEquals(tgid) => {
1346 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1347 mt.tgid = *tgid;
1348 }
1349 LayerMatch::NSPIDEquals(nsid, pid) => {
1350 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1351 mt.nsid = *nsid;
1352 mt.pid = *pid;
1353 }
1354 LayerMatch::NSEquals(nsid) => {
1355 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1356 mt.nsid = *nsid as u64;
1357 }
1358 LayerMatch::CmdJoin(joincmd) => {
1359 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1360 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1361 }
1362 LayerMatch::IsGroupLeader(polarity) => {
1363 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1364 mt.is_group_leader.write(*polarity);
1365 }
1366 LayerMatch::IsKthread(polarity) => {
1367 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1368 mt.is_kthread.write(*polarity);
1369 }
1370 LayerMatch::UsedGpuTid(polarity) => {
1371 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1372 mt.used_gpu_tid.write(*polarity);
1373 }
1374 LayerMatch::UsedGpuPid(polarity) => {
1375 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1376 mt.used_gpu_pid.write(*polarity);
1377 }
1378 LayerMatch::AvgRuntime(min, max) => {
1379 mt.kind = bpf_intf::layer_match_kind_MATCH_AVG_RUNTIME as i32;
1380 mt.min_avg_runtime_us = *min;
1381 mt.max_avg_runtime_us = *max;
1382 }
1383 }
1384 }
1385 layer.matches[or_i].nr_match_ands = or.len() as i32;
1386 }
1387
1388 layer.nr_match_ors = spec.matches.len() as u32;
1389 layer.kind = spec.kind.as_bpf_enum();
1390
1391 {
1392 let LayerCommon {
1393 min_exec_us,
1394 yield_ignore,
1395 perf,
1396 preempt,
1397 preempt_first,
1398 exclusive,
1399 allow_node_aligned,
1400 skip_remote_node,
1401 prev_over_idle_core,
1402 growth_algo,
1403 nodes,
1404 slice_us,
1405 fifo,
1406 weight,
1407 disallow_open_after_us,
1408 disallow_preempt_after_us,
1409 xllc_mig_min_us,
1410 placement,
1411 ..
1412 } = spec.kind.common();
1413
1414 layer.slice_ns = *slice_us * 1000;
1415 layer.fifo.write(*fifo);
1416 layer.min_exec_ns = min_exec_us * 1000;
1417 layer.yield_step_ns = if *yield_ignore > 0.999 {
1418 0
1419 } else if *yield_ignore < 0.001 {
1420 layer.slice_ns
1421 } else {
1422 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1423 };
1424 let mut layer_name: String = spec.name.clone();
1425 layer_name.truncate(MAX_LAYER_NAME);
1426 copy_into_cstr(&mut layer.name, layer_name.as_str());
1427 layer.preempt.write(*preempt);
1428 layer.preempt_first.write(*preempt_first);
1429 layer.excl.write(*exclusive);
1430 layer.allow_node_aligned.write(*allow_node_aligned);
1431 layer.skip_remote_node.write(*skip_remote_node);
1432 layer.prev_over_idle_core.write(*prev_over_idle_core);
1433 layer.growth_algo = growth_algo.as_bpf_enum();
1434 layer.weight = *weight;
1435 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1436 v if v == u64::MAX => v,
1437 v => v * 1000,
1438 };
1439 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1440 v if v == u64::MAX => v,
1441 v => v * 1000,
1442 };
1443 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1444 layer_weights.push(layer.weight.try_into().unwrap());
1445 layer.perf = u32::try_from(*perf)?;
1446 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1447 for (topo_node_id, topo_node) in &topo.nodes {
1448 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1449 continue;
1450 }
1451 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1452 }
1453
1454 let task_place = |place: u32| crate::types::layer_task_place(place);
1455 layer.task_place = match placement {
1456 LayerPlacement::Standard => {
1457 task_place(bpf_intf::layer_task_place_PLACEMENT_STD as u32)
1458 }
1459 LayerPlacement::Sticky => {
1460 task_place(bpf_intf::layer_task_place_PLACEMENT_STICK as u32)
1461 }
1462 LayerPlacement::Floating => {
1463 task_place(bpf_intf::layer_task_place_PLACEMENT_FLOAT as u32)
1464 }
1465 };
1466 }
1467
1468 layer.is_protected.write(match spec.kind {
1469 LayerKind::Open { .. } => false,
1470 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1471 protected
1472 }
1473 });
1474
1475 match &spec.cpuset {
1476 Some(mask) => {
1477 Self::update_cpumask(&mask, &mut layer.cpuset);
1478 }
1479 None => {
1480 for i in 0..layer.cpuset.len() {
1481 layer.cpuset[i] = u8::MAX;
1482 }
1483 }
1484 };
1485
1486 perf_set |= layer.perf > 0;
1487 }
1488
1489 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1490 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1491 skel.maps.rodata_data.layer_iteration_order[idx] = *layer_idx as u32;
1492 }
1493
1494 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1495 warn!("cpufreq support not available, ignoring perf configurations");
1496 }
1497
1498 Ok(())
1499 }
1500
1501 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1502 skel.maps.rodata_data.nr_nodes = topo.nodes.len() as u32;
1503 skel.maps.rodata_data.nr_llcs = 0;
1504
1505 for (&node_id, node) in &topo.nodes {
1506 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1507 skel.maps.rodata_data.nr_llcs += node.llcs.len() as u32;
1508 let raw_numa_slice = node.span.as_raw_slice();
1509 let node_cpumask_slice = &mut skel.maps.rodata_data.numa_cpumasks[node_id];
1510 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1511 left.clone_from_slice(raw_numa_slice);
1512 debug!(
1513 "node {} mask: {:?}",
1514 node_id, skel.maps.rodata_data.numa_cpumasks[node_id]
1515 );
1516
1517 for llc in node.llcs.values() {
1518 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1519 skel.maps.rodata_data.llc_numa_id_map[llc.id] = node_id as u32;
1520 }
1521 }
1522
1523 for cpu in topo.all_cpus.values() {
1524 skel.maps.rodata_data.cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1525 }
1526 }
1527
1528 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1529 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1530 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1531 vec
1532 };
1533 let radiate_cpu =
1534 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1535 vec.sort_by_key(|&id| {
1536 (
1537 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1538 (center_cpu as i32 - id as i32).abs(),
1539 )
1540 });
1541 vec
1542 };
1543
1544 for (&cpu_id, cpu) in &topo.all_cpus {
1545 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1547 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1548 let node_span = &topo.nodes[&cpu.node_id].span;
1549 let sys_span = &topo.span;
1550
1551 let sys_span = sys_span.and(&node_span.not());
1553 let node_span = node_span.and(&llc_span.not());
1554 let llc_span = llc_span.and(&core_span.not());
1555 core_span.clear_cpu(cpu_id).unwrap();
1556
1557 let mut sys_order: Vec<usize> = sys_span.iter().collect();
1559 let mut node_order: Vec<usize> = node_span.iter().collect();
1560 let mut llc_order: Vec<usize> = llc_span.iter().collect();
1561 let mut core_order: Vec<usize> = core_span.iter().collect();
1562
1563 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1568 node_order = radiate(node_order, cpu.node_id);
1569 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1570 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1571
1572 let mut order: Vec<usize> = vec![];
1574 let mut idx: usize = 0;
1575
1576 idx += 1;
1577 order.push(cpu_id);
1578
1579 idx += core_order.len();
1580 order.append(&mut core_order);
1581 let core_end = idx;
1582
1583 idx += llc_order.len();
1584 order.append(&mut llc_order);
1585 let llc_end = idx;
1586
1587 idx += node_order.len();
1588 order.append(&mut node_order);
1589 let node_end = idx;
1590
1591 idx += sys_order.len();
1592 order.append(&mut sys_order);
1593 let sys_end = idx;
1594
1595 debug!(
1596 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1597 cpu_id, core_end, llc_end, node_end, sys_end, &order
1598 );
1599
1600 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1602 for (i, &cpu) in order.iter().enumerate() {
1603 pmap.cpus[i] = cpu as u16;
1604 }
1605 pmap.core_end = core_end as u32;
1606 pmap.llc_end = llc_end as u32;
1607 pmap.node_end = node_end as u32;
1608 pmap.sys_end = sys_end as u32;
1609 }
1610 }
1611
1612 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1613 cpu_ctxs
1614 .into_iter()
1615 .map(|cpu_ctx| {
1616 let bytes = unsafe {
1617 std::slice::from_raw_parts(
1618 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1619 std::mem::size_of::<bpf_intf::cpu_ctx>(),
1620 )
1621 };
1622 bytes.to_vec()
1623 })
1624 .collect()
1625 }
1626
1627 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1628 let key = (0_u32).to_ne_bytes();
1629 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1630 let cpu_ctxs_vec = skel
1631 .maps
1632 .cpu_ctxs
1633 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1634 .context("Failed to lookup cpu_ctx")?
1635 .unwrap();
1636
1637 let op_layers: Vec<u32> = layer_specs
1638 .iter()
1639 .enumerate()
1640 .filter(|(_idx, spec)| match &spec.kind {
1641 LayerKind::Open { .. } => spec.kind.common().preempt,
1642 _ => false,
1643 })
1644 .map(|(idx, _)| idx as u32)
1645 .collect();
1646 let on_layers: Vec<u32> = layer_specs
1647 .iter()
1648 .enumerate()
1649 .filter(|(_idx, spec)| match &spec.kind {
1650 LayerKind::Open { .. } => !spec.kind.common().preempt,
1651 _ => false,
1652 })
1653 .map(|(idx, _)| idx as u32)
1654 .collect();
1655 let gp_layers: Vec<u32> = layer_specs
1656 .iter()
1657 .enumerate()
1658 .filter(|(_idx, spec)| match &spec.kind {
1659 LayerKind::Grouped { .. } => spec.kind.common().preempt,
1660 _ => false,
1661 })
1662 .map(|(idx, _)| idx as u32)
1663 .collect();
1664 let gn_layers: Vec<u32> = layer_specs
1665 .iter()
1666 .enumerate()
1667 .filter(|(_idx, spec)| match &spec.kind {
1668 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1669 _ => false,
1670 })
1671 .map(|(idx, _)| idx as u32)
1672 .collect();
1673
1674 for cpu in 0..*NR_CPUS_POSSIBLE {
1676 cpu_ctxs.push(*unsafe {
1677 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
1678 });
1679
1680 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
1681 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
1682 cpu_ctxs[cpu].cpu = cpu as i32;
1683 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
1684 cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
1685 cpu_ctxs[cpu].is_big = is_big;
1686
1687 fastrand::seed(cpu as u64);
1688
1689 let mut ogp_order = op_layers.clone();
1690 ogp_order.append(&mut gp_layers.clone());
1691 fastrand::shuffle(&mut ogp_order);
1692
1693 let mut ogn_order = on_layers.clone();
1694 ogn_order.append(&mut gn_layers.clone());
1695 fastrand::shuffle(&mut ogn_order);
1696
1697 let mut op_order = op_layers.clone();
1698 fastrand::shuffle(&mut op_order);
1699
1700 let mut on_order = on_layers.clone();
1701 fastrand::shuffle(&mut on_order);
1702
1703 let mut gp_order = gp_layers.clone();
1704 fastrand::shuffle(&mut gp_order);
1705
1706 let mut gn_order = gn_layers.clone();
1707 fastrand::shuffle(&mut gn_order);
1708
1709 for i in 0..MAX_LAYERS {
1710 cpu_ctxs[cpu].ogp_layer_order[i] =
1711 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1712 cpu_ctxs[cpu].ogn_layer_order[i] =
1713 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1714
1715 cpu_ctxs[cpu].op_layer_order[i] =
1716 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1717 cpu_ctxs[cpu].on_layer_order[i] =
1718 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1719 cpu_ctxs[cpu].gp_layer_order[i] =
1720 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1721 cpu_ctxs[cpu].gn_layer_order[i] =
1722 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1723 }
1724 }
1725
1726 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
1727
1728 skel.maps
1729 .cpu_ctxs
1730 .update_percpu(
1731 &key,
1732 &Self::convert_cpu_ctxs(cpu_ctxs),
1733 libbpf_rs::MapFlags::ANY,
1734 )
1735 .context("Failed to update cpu_ctx")?;
1736
1737 Ok(())
1738 }
1739
1740 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
1741 for (&llc_id, llc) in &topo.all_llcs {
1742 let mut node_order: Vec<usize> =
1744 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
1745 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
1746
1747 sys_order.retain(|id| !node_order.contains(id));
1749 node_order.retain(|&id| id != llc_id);
1750
1751 fastrand::seed(llc_id as u64);
1754 fastrand::shuffle(&mut sys_order);
1755 fastrand::shuffle(&mut node_order);
1756
1757 let mut order: Vec<usize> = vec![];
1759 let mut idx: usize = 0;
1760
1761 idx += 1;
1762 order.push(llc_id);
1763
1764 idx += node_order.len();
1765 order.append(&mut node_order);
1766 let node_end = idx;
1767
1768 idx += sys_order.len();
1769 order.append(&mut sys_order);
1770 let sys_end = idx;
1771
1772 debug!(
1773 "LLC[{}] proximity map[{}/{}]: {:?}",
1774 llc_id, node_end, sys_end, &order
1775 );
1776
1777 let key = llc_id as u32;
1782 let llc_id_slice =
1783 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
1784 let v = skel
1785 .maps
1786 .llc_data
1787 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
1788 .unwrap()
1789 .unwrap();
1790 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
1791
1792 let pmap = &mut llcc.prox_map;
1793 for (i, &llc_id) in order.iter().enumerate() {
1794 pmap.llcs[i] = llc_id as u16;
1795 }
1796 pmap.node_end = node_end as u32;
1797 pmap.sys_end = sys_end as u32;
1798
1799 let v = unsafe {
1800 std::slice::from_raw_parts(
1801 &llcc as *const bpf_intf::llc_ctx as *const u8,
1802 std::mem::size_of::<bpf_intf::llc_ctx>(),
1803 )
1804 };
1805
1806 skel.maps
1807 .llc_data
1808 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
1809 }
1810
1811 Ok(())
1812 }
1813
1814 fn init(
1815 opts: &'a Opts,
1816 layer_specs: &[LayerSpec],
1817 open_object: &'a mut MaybeUninit<OpenObject>,
1818 ) -> Result<Self> {
1819 let nr_layers = layer_specs.len();
1820 let mut disable_topology = opts.disable_topology.unwrap_or(false);
1821
1822 let topo = Arc::new(if disable_topology {
1823 Topology::with_flattened_llc_node()?
1824 } else {
1825 Topology::new()?
1826 });
1827
1828 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
1835 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
1836 }
1837 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
1838 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
1839 }
1840 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
1841 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
1842 }
1843
1844 let netdevs = if opts.netdev_irq_balance {
1845 warn!(
1846 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
1847 );
1848 read_netdevs()?
1849 } else {
1850 BTreeMap::new()
1851 };
1852
1853 if !disable_topology {
1854 if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
1855 disable_topology = true;
1856 };
1857 info!(
1858 "Topology awareness not specified, selecting {} based on hardware",
1859 if disable_topology {
1860 "disabled"
1861 } else {
1862 "enabled"
1863 }
1864 );
1865 };
1866
1867 let cpu_pool = CpuPool::new(topo.clone())?;
1868
1869 let layer_specs: Vec<_> = if disable_topology {
1872 info!("Disabling topology awareness");
1873 layer_specs
1874 .iter()
1875 .cloned()
1876 .map(|mut s| {
1877 s.kind.common_mut().nodes.clear();
1878 s.kind.common_mut().llcs.clear();
1879 s
1880 })
1881 .collect()
1882 } else {
1883 layer_specs.to_vec()
1884 };
1885
1886 let mut skel_builder = BpfSkelBuilder::default();
1888 skel_builder.obj_builder.debug(opts.verbose > 1);
1889 init_libbpf_logging(None);
1890 let mut skel = scx_ops_open!(skel_builder, open_object, layered)?;
1891
1892 if opts.enable_gpu_support {
1895 if opts.gpu_kprobe_level >= 1 {
1898 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
1899 }
1900 if opts.gpu_kprobe_level >= 2 {
1903 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
1904 }
1905 if opts.gpu_kprobe_level >= 3 {
1906 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
1907 }
1908 }
1909
1910 let ext_sched_class_addr = get_kallsyms_addr("ext_sched_class");
1911 let idle_sched_class_addr = get_kallsyms_addr("idle_sched_class");
1912
1913 if ext_sched_class_addr.is_ok() && idle_sched_class_addr.is_ok() {
1914 skel.maps.rodata_data.ext_sched_class_addr = ext_sched_class_addr.unwrap();
1915 skel.maps.rodata_data.idle_sched_class_addr = idle_sched_class_addr.unwrap();
1916 } else {
1917 warn!(
1918 "Unable to get sched_class addresses from /proc/kallsyms, disabling skip_preempt."
1919 );
1920 }
1921
1922 skel.maps.rodata_data.slice_ns = scx_enums.SCX_SLICE_DFL;
1923 skel.maps.rodata_data.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
1924
1925 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
1927
1928 if !opts.disable_queued_wakeup {
1929 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
1930 0 => info!("Kernel does not support queued wakeup optimization"),
1931 v => skel.struct_ops.layered_mut().flags |= v,
1932 }
1933 }
1934
1935 skel.maps.rodata_data.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
1936 skel.maps.rodata_data.percpu_kthread_preempt_all =
1937 !opts.disable_percpu_kthread_preempt && opts.percpu_kthread_preempt_all;
1938 skel.maps.rodata_data.debug = opts.verbose as u32;
1939 skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
1940 skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
1941 opts.max_exec_us * 1000
1942 } else {
1943 opts.slice_us * 1000 * 20
1944 };
1945 skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u32;
1946 skel.maps.rodata_data.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
1947 skel.maps.rodata_data.smt_enabled = topo.smt_enabled;
1948 skel.maps.rodata_data.has_little_cores = topo.has_little_cores();
1949 skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
1950 skel.maps.rodata_data.antistall_sec = opts.antistall_sec;
1951 skel.maps.rodata_data.monitor_disable = opts.monitor_disable;
1952 skel.maps.rodata_data.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
1953 skel.maps.rodata_data.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
1954 skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
1955 skel.maps.rodata_data.enable_match_debug = opts.enable_match_debug;
1956 skel.maps.rodata_data.enable_gpu_support = opts.enable_gpu_support;
1957
1958 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
1959 skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
1960 }
1961 for cpu in topo.all_cpus.keys() {
1962 skel.maps.rodata_data.all_cpus[cpu / 8] |= 1 << (cpu % 8);
1963 }
1964
1965 skel.maps.rodata_data.nr_op_layers = layer_specs
1966 .iter()
1967 .filter(|spec| match &spec.kind {
1968 LayerKind::Open { .. } => spec.kind.common().preempt,
1969 _ => false,
1970 })
1971 .count() as u32;
1972 skel.maps.rodata_data.nr_on_layers = layer_specs
1973 .iter()
1974 .filter(|spec| match &spec.kind {
1975 LayerKind::Open { .. } => !spec.kind.common().preempt,
1976 _ => false,
1977 })
1978 .count() as u32;
1979 skel.maps.rodata_data.nr_gp_layers = layer_specs
1980 .iter()
1981 .filter(|spec| match &spec.kind {
1982 LayerKind::Grouped { .. } => spec.kind.common().preempt,
1983 _ => false,
1984 })
1985 .count() as u32;
1986 skel.maps.rodata_data.nr_gn_layers = layer_specs
1987 .iter()
1988 .filter(|spec| match &spec.kind {
1989 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1990 _ => false,
1991 })
1992 .count() as u32;
1993 skel.maps.rodata_data.nr_excl_layers = layer_specs
1994 .iter()
1995 .filter(|spec| spec.kind.common().exclusive)
1996 .count() as u32;
1997
1998 let mut min_open = u64::MAX;
1999 let mut min_preempt = u64::MAX;
2000
2001 for spec in layer_specs.iter() {
2002 if let LayerKind::Open { common, .. } = &spec.kind {
2003 min_open = min_open.min(common.disallow_open_after_us.unwrap());
2004 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
2005 }
2006 }
2007
2008 skel.maps.rodata_data.min_open_layer_disallow_open_after_ns = match min_open {
2009 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
2010 v => v,
2011 };
2012 skel.maps
2013 .rodata_data
2014 .min_open_layer_disallow_preempt_after_ns = match min_preempt {
2015 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
2016 v => v,
2017 };
2018
2019 for i in 0..layer_specs.len() {
2021 skel.maps.bss_data.empty_layer_ids[i] = i as u32;
2022 }
2023 skel.maps.bss_data.nr_empty_layer_ids = nr_layers as u32;
2024
2025 Self::init_layers(&mut skel, &layer_specs, &topo)?;
2026 Self::init_nodes(&mut skel, opts, &topo);
2027
2028 let layered_task_hint_map_path = &opts.task_hint_map;
2033 let hint_map = &mut skel.maps.scx_layered_task_hint_map;
2034 if layered_task_hint_map_path.is_empty() == false {
2036 hint_map.set_pin_path(layered_task_hint_map_path).unwrap();
2037 }
2038
2039 let mut skel = scx_ops_load!(skel, layered, uei)?;
2040
2041 let mut layers = vec![];
2042 let layer_growth_orders =
2043 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
2044 for (idx, spec) in layer_specs.iter().enumerate() {
2045 let growth_order = layer_growth_orders
2046 .get(&idx)
2047 .with_context(|| "layer has no growth order".to_string())?;
2048 layers.push(Layer::new(spec, &topo, growth_order)?);
2049 }
2050
2051 let mut idle_qos_enabled = layers
2052 .iter()
2053 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
2054 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
2055 warn!("idle_resume_us not supported, ignoring");
2056 idle_qos_enabled = false;
2057 }
2058
2059 Self::init_cpus(&skel, &layer_specs, &topo)?;
2060 Self::init_llc_prox_map(&mut skel, &topo)?;
2061
2062 let proc_reader = procfs::ProcReader::new();
2064
2065 let input = ProgramInput {
2067 ..Default::default()
2068 };
2069 let prog = &mut skel.progs.initialize_pid_namespace;
2070
2071 let _ = prog.test_run(input);
2072
2073 if layered_task_hint_map_path.is_empty() == false {
2082 let path = CString::new(layered_task_hint_map_path.as_bytes()).unwrap();
2083 let mode: libc::mode_t = 0o666;
2084 unsafe {
2085 if libc::chmod(path.as_ptr(), mode) != 0 {
2086 trace!("'chmod' to 666 of task hint map failed, continuing...");
2087 }
2088 }
2089 }
2090
2091 let struct_ops = scx_ops_attach!(skel, layered)?;
2093 let stats_server = StatsServer::new(stats::server_data()).launch()?;
2094
2095 let sched = Self {
2096 struct_ops: Some(struct_ops),
2097 layer_specs,
2098
2099 sched_intv: Duration::from_secs_f64(opts.interval),
2100 layer_refresh_intv: Duration::from_millis(opts.layer_refresh_ms_avgruntime),
2101
2102 cpu_pool,
2103 layers,
2104 idle_qos_enabled,
2105
2106 sched_stats: Stats::new(&mut skel, &proc_reader)?,
2107
2108 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
2109 processing_dur: Default::default(),
2110
2111 proc_reader,
2112 skel,
2113
2114 topo,
2115 netdevs,
2116 stats_server,
2117 };
2118
2119 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
2120
2121 Ok(sched)
2122 }
2123
2124 fn update_cpumask(mask: &Cpumask, bpfmask: &mut [u8]) {
2125 for cpu in 0..mask.len() {
2126 if mask.test_cpu(cpu) {
2127 bpfmask[cpu / 8] |= 1 << (cpu % 8);
2128 } else {
2129 bpfmask[cpu / 8] &= !(1 << (cpu % 8));
2130 }
2131 }
2132 }
2133
2134 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
2135 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
2136 Self::update_cpumask(&layer.cpus, &mut bpf_layer.cpus);
2137
2138 bpf_layer.nr_cpus = layer.nr_cpus as u32;
2139 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
2140 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
2141 }
2142
2143 bpf_layer.refresh_cpus = 1;
2144 }
2145
2146 fn update_netdev_cpumasks(&mut self) -> Result<()> {
2147 let available_cpus = self.cpu_pool.available_cpus();
2148 if available_cpus.is_empty() {
2149 return Ok(());
2150 }
2151
2152 for (iface, netdev) in self.netdevs.iter_mut() {
2153 let node = self
2154 .topo
2155 .nodes
2156 .values()
2157 .take_while(|n| n.id == netdev.node())
2158 .next()
2159 .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2160 let node_cpus = node.span.clone();
2161 for (irq, irqmask) in netdev.irqs.iter_mut() {
2162 irqmask.clear_all();
2163 for cpu in available_cpus.iter() {
2164 if !node_cpus.test_cpu(cpu) {
2165 continue;
2166 }
2167 let _ = irqmask.set_cpu(cpu);
2168 }
2169 if irqmask.weight() == 0 {
2171 for cpu in node_cpus.iter() {
2172 let _ = irqmask.set_cpu(cpu);
2173 }
2174 }
2175 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2176 }
2177 netdev.apply_cpumasks()?;
2178 }
2179
2180 Ok(())
2181 }
2182
2183 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2189 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2190 let utils = &self.sched_stats.layer_utils;
2191
2192 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2193 let mut targets: Vec<(usize, usize)> = vec![];
2194
2195 for (idx, layer) in self.layers.iter().enumerate() {
2196 targets.push(match &layer.kind {
2197 LayerKind::Confined {
2198 util_range,
2199 cpus_range,
2200 cpus_range_frac,
2201 ..
2202 }
2203 | LayerKind::Grouped {
2204 util_range,
2205 cpus_range,
2206 cpus_range_frac,
2207 ..
2208 } => {
2209 let owned = utils[idx][LAYER_USAGE_OWNED];
2214 let open = utils[idx][LAYER_USAGE_OPEN];
2215
2216 let mut util = owned;
2217 if layer.kind.util_includes_open_cputime() || layer.nr_cpus == 0 {
2218 util += open;
2219 }
2220
2221 let util = if util < 0.01 { 0.0 } else { util };
2222 let low = (util / util_range.1).ceil() as usize;
2223 let high = ((util / util_range.0).floor() as usize).max(low);
2224 let target = layer.cpus.weight().clamp(low, high);
2225 let cpus_range =
2226 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2227
2228 records.push((
2229 (owned * 100.0) as u64,
2230 (open * 100.0) as u64,
2231 (util * 100.0) as u64,
2232 low,
2233 high,
2234 target,
2235 ));
2236
2237 (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2238 }
2239 LayerKind::Open { .. } => (0, 0),
2240 });
2241 }
2242
2243 trace!("initial targets: {:?}", &targets);
2244 trace!("(owned, open, util, low, high, target): {:?}", &records);
2245 targets
2246 }
2247
2248 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2252 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2253 let weights: Vec<usize> = self
2254 .layers
2255 .iter()
2256 .map(|layer| layer.kind.common().weight as usize)
2257 .collect();
2258 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2259 .iter()
2260 .zip(&weights)
2261 .enumerate()
2262 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2263 .collect();
2264 let mut weight_sum: usize = weights.iter().sum();
2265 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2266
2267 trace!("cands: {:?}", &cands);
2268
2269 cands.retain(|&i, &mut (target, min, weight)| {
2271 if target <= min {
2272 let target = target.min(nr_left);
2273 weighted[i] = target;
2274 weight_sum -= weight;
2275 nr_left -= target;
2276 false
2277 } else {
2278 true
2279 }
2280 });
2281
2282 trace!("cands after accepting mins: {:?}", &cands);
2283
2284 let calc_share = |nr_left, weight, weight_sum| {
2286 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2287 };
2288
2289 while !cands.is_empty() {
2290 let mut progress = false;
2291
2292 cands.retain(|&i, &mut (target, _min, weight)| {
2293 let share = calc_share(nr_left, weight, weight_sum);
2294 if target <= share {
2295 weighted[i] = target;
2296 weight_sum -= weight;
2297 nr_left -= target;
2298 progress = true;
2299 false
2300 } else {
2301 true
2302 }
2303 });
2304
2305 if !progress {
2306 break;
2307 }
2308 }
2309
2310 trace!("cands after accepting under allotted: {:?}", &cands);
2311
2312 let nr_to_share = nr_left;
2315 for (i, (_target, _min, weight)) in cands.into_iter() {
2316 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2317 weighted[i] = share;
2318 nr_left -= share;
2319 }
2320
2321 trace!("weighted: {:?}", &weighted);
2322
2323 weighted
2324 }
2325
    fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
        let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
        let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
        let cpus_per_llc = cores_per_llc * cpus_per_core;

        let full = target / cpus_per_llc;
        let extra = target % cpus_per_llc;

        (full, extra.div_ceil(cpus_per_core))
    }

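    /// Rebuild the core selection order of layers that use the StickyDynamic
    /// growth algorithm. Whole LLCs are released by layers whose targets
    /// shrank, handed to layers whose targets grew, and each layer's
    /// core_order is then regenerated from partially used free LLCs (for the
    /// extra cores) plus its assigned LLCs.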
    fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
        debug!(
            " free: before pass: free_llcs={:?}",
            self.cpu_pool.free_llcs
        );
        // Release whole LLCs from layers whose full-LLC target shrank.
        for &(idx, target) in layer_targets.iter().rev() {
            let layer = &mut self.layers[idx];
            let old_tlc = layer.target_llc_cpus;
            let new_tlc = Self::compute_target_llcs(target, &self.topo);

            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
                continue;
            }

            let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;

            debug!(
                " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
                layer.name,
                old_tlc,
                new_tlc,
                to_free,
                layer.assigned_llcs.len(),
                self.cpu_pool.free_llcs.len()
            );

            while to_free > 0 && !layer.assigned_llcs.is_empty() {
                let llc = layer.assigned_llcs.pop().unwrap();
                self.cpu_pool.free_llcs.push((llc, 0));
                to_free -= 1;

                debug!(" layer={} freed_llc={}", layer.name, llc);
            }
        }
        debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);

        // Hand freed LLCs to layers whose full-LLC target grew.
        for &(idx, target) in layer_targets.iter().rev() {
            let layer = &mut self.layers[idx];
            let old_tlc = layer.target_llc_cpus;
            let new_tlc = Self::compute_target_llcs(target, &self.topo);

            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
                continue;
            }

            let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;

            debug!(
                " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
                layer.name,
                old_tlc,
                new_tlc,
                to_alloc,
                layer.assigned_llcs.len(),
                self.cpu_pool.free_llcs.len()
            );

            while to_alloc > 0
                && !self.cpu_pool.free_llcs.is_empty()
                && to_alloc <= self.cpu_pool.free_llcs.len()
            {
                let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
                layer.assigned_llcs.push(llc);
                to_alloc -= 1;

                debug!(" layer={} alloc_llc={}", layer.name, llc);
            }

            debug!(
                " alloc: layer={} assigned_llcs={:?}",
                layer.name, layer.assigned_llcs
            );

            layer.target_llc_cpus = new_tlc;
        }

        // Cover each layer's extra cores from partially used free LLCs.
        for &(idx, _) in layer_targets.iter() {
            let mut core_order = vec![];
            let layer = &mut self.layers[idx];

            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
                continue;
            }

            let tlc = layer.target_llc_cpus;
            let mut extra = tlc.1;
            let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
            let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
            let cpus_per_llc = cores_per_llc * cpus_per_core;

            for i in 0..self.cpu_pool.free_llcs.len() {
                let free_vec = &mut self.cpu_pool.free_llcs;
                let avail = cpus_per_llc - free_vec[i].1;
                let mut used = extra.min(avail);

                let shift = free_vec[i].1;
                free_vec[i].1 += used;

                let llc_id = free_vec[i].0;
                let llc = self.topo.all_llcs.get(&llc_id).unwrap();

                for core in llc.cores.iter().skip(shift) {
                    core_order.push(core.1.id);
                    if used == 0 {
                        break;
                    }
                    used -= 1;
                }

                extra -= used;
                if extra == 0 {
                    break;
                }
            }

            core_order.reverse();
            layer.core_order = core_order;
        }

        // Clear the scratch usage counters on the free LLCs.
        for i in 0..self.cpu_pool.free_llcs.len() {
            self.cpu_pool.free_llcs[i].1 = 0;
        }

        // Then append the cores of each layer's own assigned LLCs.
        for &(idx, _) in layer_targets.iter() {
            let layer = &mut self.layers[idx];

            if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
                continue;
            }

            for core in self.topo.all_cores.iter() {
                let llc_id = core.1.llc_id;
                if layer.assigned_llcs.contains(&llc_id) {
                    layer.core_order.push(core.1.id);
                }
            }
            layer.core_order.reverse();

            debug!(
                " alloc: layer={} core_order={:?}",
                layer.name, layer.core_order
            );
        }
    }

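    /// Recompute per-layer CPU targets and move CPUs between layers to meet
    /// them: shrink layers that are over target, grow layers that are under,
    /// give open layers whatever remains available, and push the updated
    /// cpumasks and empty-layer list to the BPF side.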
    fn refresh_cpumasks(&mut self) -> Result<()> {
        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });

        let mut updated = false;
        let targets = self.calc_target_nr_cpus();
        let targets = self.weighted_target_nr_cpus(&targets);

        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
        ascending.sort_by(|a, b| a.1.cmp(&b.1));

        self.recompute_layer_core_order(&ascending);

        // If any layer is under its target, let the first shrink below free
        // aggressively so CPUs become available for reallocation.
        let mut force_free = self
            .layers
            .iter()
            .zip(targets.iter())
            .any(|(layer, &target)| layer.nr_cpus < target);

        // Shrink layers above their targets, starting from the largest targets.
        for &(idx, target) in ascending.iter().rev() {
            let layer = &mut self.layers[idx];
            if layer_is_open(layer) {
                continue;
            }

            let nr_cur = layer.cpus.weight();
            if nr_cur <= target {
                continue;
            }
            let mut nr_to_free = nr_cur - target;

            let nr_to_break_at = nr_to_free / 2;

            let mut freed = false;

            while nr_to_free > 0 {
                let max_to_free = if force_free {
                    force_free = false;
                    layer.nr_cpus
                } else {
                    nr_to_free
                };

                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
                if nr_freed == 0 {
                    break;
                }

                nr_to_free = nr_to_free.saturating_sub(nr_freed);
                freed = true;

                if nr_to_free <= nr_to_break_at {
                    break;
                }
            }

            if freed {
                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
                updated = true;
            }
        }

        // Grow layers below their targets, starting from the smallest targets.
        for &(idx, target) in &ascending {
            let layer = &mut self.layers[idx];

            if layer_is_open(layer) {
                continue;
            }

            let nr_cur = layer.cpus.weight();
            if nr_cur >= target {
                continue;
            }

            let mut nr_to_alloc = target - nr_cur;
            let mut alloced = false;

            while nr_to_alloc > 0 {
                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
                if nr_alloced == 0 {
                    break;
                }
                alloced = true;
                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
            }

            if alloced {
                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
                updated = true;
            }
        }

        if updated {
            // Open layers run on whatever CPUs are currently unassigned.
            for (idx, layer) in self.layers.iter_mut().enumerate() {
                if !layer_is_open(layer) {
                    continue;
                }

                let bpf_layer = &mut self.skel.maps.bss_data.layers[idx];
                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
                let nr_available_cpus = available_cpus.weight();

                layer.cpus = available_cpus;
                layer.nr_cpus = nr_available_cpus;
                Self::update_bpf_layer_cpumask(layer, bpf_layer);
            }

            self.skel.maps.bss_data.fallback_cpu = self.cpu_pool.fallback_cpu as u32;

            for (lidx, layer) in self.layers.iter().enumerate() {
                self.nr_layer_cpus_ranges[lidx] = (
                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
                );
            }

            // Trigger the refresh_layer_cpumasks BPF program.
            let input = ProgramInput {
                ..Default::default()
            };
            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
            let _ = prog.test_run(input);

            // Report which layers currently have no CPUs to the BPF side.
            let empty_layer_ids: Vec<u32> = self
                .layers
                .iter()
                .enumerate()
                .filter(|(_idx, layer)| layer.nr_cpus == 0)
                .map(|(idx, _layer)| idx as u32)
                .collect();
            for i in 0..self.layers.len() {
                self.skel.maps.bss_data.empty_layer_ids[i] =
                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
            }
            self.skel.maps.bss_data.nr_empty_layer_ids = empty_layer_ids.len() as u32;
        }

        let _ = self.update_netdev_cpumasks();
        Ok(())
    }

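    /// Program per-CPU idle resume latency from each layer's idle_resume_us;
    /// CPUs not claimed by any layer are reset to 0.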
    fn refresh_idle_qos(&mut self) -> Result<()> {
        if !self.idle_qos_enabled {
            return Ok(());
        }

        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
        for layer in self.layers.iter() {
            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
            for cpu in layer.cpus.iter() {
                cpu_idle_qos[cpu] = idle_resume_us;
            }
        }

        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
        }

        Ok(())
    }

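    /// One scheduling interval: refresh utilization stats, rebalance layer
    /// cpumasks, and reapply idle QoS, accumulating the time spent doing so.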
    fn step(&mut self) -> Result<()> {
        let started_at = Instant::now();
        self.sched_stats.refresh(
            &mut self.skel,
            &self.proc_reader,
            started_at,
            self.processing_dur,
        )?;
        self.refresh_cpumasks()?;
        self.refresh_idle_qos()?;
        self.processing_dur += Instant::now().duration_since(started_at);
        Ok(())
    }

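    /// Build a SysStats snapshot covering all layers and reset the tracked
    /// per-layer CPU count ranges for the requesting stats client.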
    fn generate_sys_stats(
        &mut self,
        stats: &Stats,
        cpus_ranges: &mut [(usize, usize)],
    ) -> Result<SysStats> {
        let bstats = &stats.bpf_stats;
        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;

        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
        }

        Ok(sys_stats)
    }

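    /// Main loop: step the scheduler every sched_intv, bump the layer refresh
    /// sequence counter every layer_refresh_intv, and service stats requests
    /// in between, until shutdown is requested or the BPF scheduler exits.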
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut next_sched_at = Instant::now() + self.sched_intv;
        let enable_layer_refresh = !self.layer_refresh_intv.is_zero();
        let mut next_layer_refresh_at = Instant::now() + self.layer_refresh_intv;
        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();

        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
            let now = Instant::now();

            if now >= next_sched_at {
                self.step()?;
                while next_sched_at < now {
                    next_sched_at += self.sched_intv;
                }
            }

            if enable_layer_refresh && now >= next_layer_refresh_at {
                self.skel.maps.bss_data.layer_refresh_seq_avgruntime += 1;
                while next_layer_refresh_at < now {
                    next_layer_refresh_at += self.layer_refresh_intv;
                }
            }

            // Service stats requests until the next scheduling tick.
            match req_ch.recv_deadline(next_sched_at) {
                Ok(StatsReq::Hello(tid)) => {
                    cpus_ranges.insert(
                        tid,
                        self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
                    );
                    let stats = Stats::new(&mut self.skel, &self.proc_reader)?;
                    res_ch.send(StatsRes::Hello(stats))?;
                }
                Ok(StatsReq::Refresh(tid, mut stats)) => {
                    for i in 0..self.nr_layer_cpus_ranges.len() {
                        for (_, ranges) in cpus_ranges.iter_mut() {
                            ranges[i] = (
                                ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
                                ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
                            );
                        }
                        self.nr_layer_cpus_ranges[i] =
                            (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
                    }

                    stats.refresh(&mut self.skel, &self.proc_reader, now, self.processing_dur)?;
                    let sys_stats =
                        self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
                    res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
                }
                Ok(StatsReq::Bye(tid)) => {
                    cpus_ranges.remove(&tid);
                    res_ch.send(StatsRes::Bye)?;
                }
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        let _ = self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

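/// Write the built-in example layer configuration to `path` as pretty-printed
/// JSON, refusing to overwrite an existing file.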
fn write_example_file(path: &str) -> Result<()> {
    let mut f = fs::OpenOptions::new()
        .create_new(true)
        .write(true)
        .open(path)?;
    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
}

fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> {
    let nr_specs = specs.len();
    if nr_specs == 0 {
        bail!("No layer spec");
    }
    if nr_specs > MAX_LAYERS {
        bail!("Too many layer specs");
    }

    for (idx, spec) in specs.iter().enumerate() {
        if idx < nr_specs - 1 {
            if spec.matches.is_empty() {
                bail!("Non-terminal spec {:?} has no matches", spec.name);
            }
        } else {
            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
                bail!("Terminal spec {:?} must have an empty match", spec.name);
            }
        }

        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
            bail!(
                "Spec {:?} has too many ({}) OR match blocks",
                spec.name,
                spec.matches.len()
            );
        }

        for (ands_idx, ands) in spec.matches.iter().enumerate() {
            if ands.len() > NR_LAYER_MATCH_KINDS {
                bail!(
                    "Spec {:?}'s OR block {} has too many ({}) match conditions",
                    spec.name,
                    ands_idx,
                    ands.len()
                );
            }
            for one in ands.iter() {
                match one {
                    LayerMatch::CgroupPrefix(prefix) => {
                        if prefix.len() > MAX_PATH {
                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
                        }
                    }
                    LayerMatch::CgroupSuffix(suffix) => {
                        if suffix.len() > MAX_PATH {
                            bail!("Spec {:?} has too long a cgroup suffix", spec.name);
                        }
                    }
                    LayerMatch::CgroupContains(substr) => {
                        if substr.len() > MAX_PATH {
                            bail!("Spec {:?} has too long a cgroup substr", spec.name);
                        }
                    }
                    LayerMatch::CommPrefix(prefix) => {
                        if prefix.len() > MAX_COMM {
                            bail!("Spec {:?} has too long a comm prefix", spec.name);
                        }
                    }
                    LayerMatch::PcommPrefix(prefix) => {
                        if prefix.len() > MAX_COMM {
                            bail!("Spec {:?} has too long a process name prefix", spec.name);
                        }
                    }
                    _ => {}
                }
            }
        }

        match spec.kind {
            LayerKind::Confined {
                cpus_range,
                util_range,
                ..
            }
            | LayerKind::Grouped {
                cpus_range,
                util_range,
                ..
            } => {
                if let Some((cpus_min, cpus_max)) = cpus_range {
                    if cpus_min > cpus_max {
                        bail!(
                            "Spec {:?} has invalid cpus_range ({}, {})",
                            spec.name,
                            cpus_min,
                            cpus_max
                        );
                    }
                }
                if util_range.0 >= util_range.1 {
                    bail!(
                        "Spec {:?} has invalid util_range ({}, {})",
                        spec.name,
                        util_range.0,
                        util_range.1
                    );
                }
            }
            _ => {}
        }
    }

    Ok(())
}

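/// Return the trailing `len` characters of `cgroup` (the whole string if it is
/// shorter), e.g. the 13-character suffix of "/sys/fs/cgroup/system.slice/" is
/// "system.slice/".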
fn name_suffix(cgroup: &str, len: usize) -> String {
    let suffixlen = std::cmp::min(len, cgroup.len());
    let suffixrev: String = cgroup.chars().rev().take(suffixlen).collect();

    suffixrev.chars().rev().collect()
}

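/// Recursively collect every directory under `dir`, used to enumerate cgroups
/// below /sys/fs/cgroup when expanding templates.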
fn traverse_sysfs(dir: &Path) -> Result<Vec<PathBuf>> {
    let mut paths = vec![];

    if !dir.is_dir() {
        panic!("path {:?} does not correspond to a directory", dir);
    }

    let direntries = fs::read_dir(dir)?;

    for entry in direntries {
        let path = entry?.path();
        if path.is_dir() {
            paths.append(&mut traverse_sysfs(&path)?);
            paths.push(path);
        }
    }

    Ok(paths)
}

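/// Read a cgroup's cpuset.cpus.effective file and parse it into a Cpumask.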
fn find_cpumask(cgroup: &str) -> Cpumask {
    let mut path = String::from(cgroup);
    path.push_str("/cpuset.cpus.effective");

    let description = fs::read_to_string(&path).unwrap();

    Cpumask::from_cpulist(&description).unwrap()
}

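/// Expand a template match rule into concrete per-cgroup matches. For
/// CgroupSuffix templates, every cgroup under /sys/fs/cgroup whose path ends
/// with the suffix yields a (CgroupSuffix match, effective cpumask) pair.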
fn expand_template(rule: &LayerMatch) -> Result<Vec<(LayerMatch, Cpumask)>> {
    match rule {
        LayerMatch::CgroupSuffix(suffix) => Ok(traverse_sysfs(Path::new("/sys/fs/cgroup"))?
            .into_iter()
            .map(|cgroup| String::from(cgroup.to_str().expect("could not parse cgroup path")))
            .filter(|cgroup| cgroup.ends_with(suffix))
            .map(|cgroup| {
                (
                    {
                        let mut slashterminated = cgroup.clone();
                        slashterminated.push('/');
                        LayerMatch::CgroupSuffix(name_suffix(&slashterminated, 64))
                    },
                    find_cpumask(&cgroup),
                )
            })
            .collect()),
        _ => panic!("Unimplemented template enum {:?}", rule),
    }
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    if opts.no_load_frac_limit {
        warn!("--no-load-frac-limit is deprecated and a no-op");
    }
    if opts.layer_preempt_weight_disable != 0.0 {
        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
    }
    if opts.layer_growth_weight_disable != 0.0 {
        warn!("--layer-growth-weight-disable is deprecated and a no-op");
    }
    if opts.local_llc_iteration {
        warn!("--local_llc_iteration is deprecated and a no-op");
    }

    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_offset_to_local()
        .expect("Failed to set local time offset")
        .set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    debug!("opts={:?}", &opts);

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    if let Some(path) = &opts.example {
        write_example_file(path)?;
        return Ok(());
    }

    let mut layer_config = match opts.run_example {
        true => EXAMPLE_CONFIG.clone(),
        false => LayerConfig { specs: vec![] },
    };

    for (idx, input) in opts.specs.iter().enumerate() {
        let specs = LayerSpec::parse(input)
            .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?;

        for spec in specs {
            match spec.template {
                Some(ref rule) => {
                    let matches = expand_template(rule)?;
                    if matches.is_empty() {
                        // No cgroup matched the template; keep the spec as written.
                        layer_config.specs.push(spec);
                    } else {
                        for (mt, mask) in matches {
                            let mut genspec = spec.clone();

                            genspec.cpuset = Some(mask);

                            // Add the generated cgroup match to every OR block.
                            for orterm in &mut genspec.matches {
                                orterm.push(mt.clone());
                            }

                            // Make the generated spec's name unique per matched cgroup.
                            match &mt {
                                LayerMatch::CgroupSuffix(cgroup) => genspec.name.push_str(cgroup),
                                _ => bail!("Template match has unexpected type"),
                            }

                            layer_config.specs.push(genspec);
                        }
                    }
                }

                None => {
                    layer_config.specs.push(spec);
                }
            }
        }
    }

    // Fill in per-layer defaults and clamp weights.
    for spec in layer_config.specs.iter_mut() {
        let common = spec.kind.common_mut();

        if common.slice_us == 0 {
            common.slice_us = opts.slice_us;
        }

        if common.weight == 0 {
            common.weight = DEFAULT_LAYER_WEIGHT;
        }
        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);

        if common.preempt {
            if common.disallow_open_after_us.is_some() {
                warn!(
                    "Preempt layer {} has non-null disallow_open_after_us, ignored",
                    &spec.name
                );
            }
            if common.disallow_preempt_after_us.is_some() {
                warn!(
                    "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
                    &spec.name
                );
            }
            common.disallow_open_after_us = Some(u64::MAX);
            common.disallow_preempt_after_us = Some(u64::MAX);
        } else {
            if common.disallow_open_after_us.is_none() {
                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
            }

            if common.disallow_preempt_after_us.is_none() {
                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
            }
        }

        if common.idle_smt.is_some() {
            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
        }
    }

    if opts.print_and_exit {
        println!("specs={}", serde_json::to_string_pretty(&layer_config)?);
        return Ok(());
    }

    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
    verify_layer_specs(&layer_config.specs)?;

    // Keep reinitializing and rerunning the scheduler until it no longer asks to restart.
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &layer_config.specs, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}