1mod bpf_skel;
6mod stats;
7
8use std::collections::BTreeMap;
9use std::collections::HashMap;
10use std::ffi::CString;
11use std::fs;
12use std::io::Write;
13use std::mem::MaybeUninit;
14use std::ops::Sub;
15use std::sync::Arc;
16use std::sync::atomic::AtomicBool;
17use std::sync::atomic::Ordering;
18use std::thread::ThreadId;
19use std::time::Duration;
20use std::time::Instant;
21
22use ::fb_procfs as procfs;
23use anyhow::Context;
24use anyhow::Result;
25use anyhow::anyhow;
26use anyhow::bail;
27pub use bpf_skel::*;
28use clap::Parser;
29use crossbeam::channel::RecvTimeoutError;
30use lazy_static::lazy_static;
31use libbpf_rs::MapCore as _;
32use libbpf_rs::OpenObject;
33use libbpf_rs::ProgramInput;
34use log::debug;
35use log::info;
36use log::trace;
37use log::warn;
38use scx_layered::*;
39use scx_stats::prelude::*;
40use scx_utils::CoreType;
41use scx_utils::Cpumask;
42use scx_utils::Llc;
43use scx_utils::NR_CPU_IDS;
44use scx_utils::NR_CPUS_POSSIBLE;
45use scx_utils::NetDev;
46use scx_utils::Topology;
47use scx_utils::UserExitInfo;
48use scx_utils::compat;
49use scx_utils::init_libbpf_logging;
50use scx_utils::pm::{cpu_idle_resume_latency_supported, update_cpu_idle_resume_latency};
51use scx_utils::read_netdevs;
52use scx_utils::scx_enums;
53use scx_utils::scx_ops_attach;
54use scx_utils::scx_ops_load;
55use scx_utils::scx_ops_open;
56use scx_utils::uei_exited;
57use scx_utils::uei_report;
58use stats::LayerStats;
59use stats::StatsReq;
60use stats::StatsRes;
61use stats::SysStats;
62
63const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
64const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
65const MAX_LAYER_WEIGHT: u32 = bpf_intf::consts_MAX_LAYER_WEIGHT;
66const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
67const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
68const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
69const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
70const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
71const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
72const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
73
74const LAYER_USAGE_OWNED: usize = bpf_intf::layer_usage_LAYER_USAGE_OWNED as usize;
75const LAYER_USAGE_OPEN: usize = bpf_intf::layer_usage_LAYER_USAGE_OPEN as usize;
76const LAYER_USAGE_SUM_UPTO: usize = bpf_intf::layer_usage_LAYER_USAGE_SUM_UPTO as usize;
77const LAYER_USAGE_PROTECTED: usize = bpf_intf::layer_usage_LAYER_USAGE_PROTECTED as usize;
78const LAYER_USAGE_PROTECTED_PREEMPT: usize =
79 bpf_intf::layer_usage_LAYER_USAGE_PROTECTED_PREEMPT as usize;
80const NR_LAYER_USAGES: usize = bpf_intf::layer_usage_NR_LAYER_USAGES as usize;
81
82const NR_GSTATS: usize = bpf_intf::global_stat_id_NR_GSTATS as usize;
83const NR_LSTATS: usize = bpf_intf::layer_stat_id_NR_LSTATS as usize;
84const NR_LLC_LSTATS: usize = bpf_intf::llc_layer_stat_id_NR_LLC_LSTATS as usize;
85
86const NR_LAYER_MATCH_KINDS: usize = bpf_intf::layer_match_kind_NR_LAYER_MATCH_KINDS as usize;
87
88lazy_static! {
89 static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64);
90 static ref DFL_DISALLOW_OPEN_AFTER_US: u64 = 2 * scx_enums.SCX_SLICE_DFL / 1000;
91 static ref DFL_DISALLOW_PREEMPT_AFTER_US: u64 = 4 * scx_enums.SCX_SLICE_DFL / 1000;
92 static ref EXAMPLE_CONFIG: LayerConfig = LayerConfig {
93 specs: vec![
94 LayerSpec {
95 name: "batch".into(),
96 comment: Some("tasks under system.slice or tasks with nice value > 0".into()),
97 matches: vec![
98 vec![LayerMatch::CgroupPrefix("system.slice/".into())],
99 vec![LayerMatch::NiceAbove(0)],
100 ],
101 kind: LayerKind::Confined {
102 util_range: (0.8, 0.9),
103 cpus_range: Some((0, 16)),
104 cpus_range_frac: None,
105 protected: false,
106 common: LayerCommon {
107 min_exec_us: 1000,
108 yield_ignore: 0.0,
109 preempt: false,
110 preempt_first: false,
111 exclusive: false,
112 allow_node_aligned: false,
113 skip_remote_node: false,
114 prev_over_idle_core: false,
115 idle_smt: None,
116 slice_us: 20000,
117 fifo: false,
118 weight: DEFAULT_LAYER_WEIGHT,
119 disallow_open_after_us: None,
120 disallow_preempt_after_us: None,
121 xllc_mig_min_us: 1000.0,
122 growth_algo: LayerGrowthAlgo::Sticky,
123 idle_resume_us: None,
124 perf: 1024,
125 nodes: vec![],
126 llcs: vec![],
127 },
128 },
129 },
130 LayerSpec {
131 name: "immediate".into(),
132 comment: Some("tasks under workload.slice with nice value < 0".into()),
133 matches: vec![vec![
134 LayerMatch::CgroupPrefix("workload.slice/".into()),
135 LayerMatch::NiceBelow(0),
136 ]],
137 kind: LayerKind::Open {
138 common: LayerCommon {
139 min_exec_us: 100,
140 yield_ignore: 0.25,
141 preempt: true,
142 preempt_first: false,
143 exclusive: true,
144 allow_node_aligned: true,
145 skip_remote_node: false,
146 prev_over_idle_core: true,
147 idle_smt: None,
148 slice_us: 20000,
149 fifo: false,
150 weight: DEFAULT_LAYER_WEIGHT,
151 disallow_open_after_us: None,
152 disallow_preempt_after_us: None,
153 xllc_mig_min_us: 0.0,
154 growth_algo: LayerGrowthAlgo::Sticky,
155 perf: 1024,
156 idle_resume_us: None,
157 nodes: vec![],
158 llcs: vec![],
159 },
160 },
161 },
162 LayerSpec {
163 name: "stress-ng".into(),
164 comment: Some("stress-ng test layer".into()),
165 matches: vec![
166 vec![LayerMatch::CommPrefix("stress-ng".into()),],
167 vec![LayerMatch::PcommPrefix("stress-ng".into()),]
168 ],
169 kind: LayerKind::Confined {
170 cpus_range: None,
171 util_range: (0.2, 0.8),
172 protected: false,
173 cpus_range_frac: None,
174 common: LayerCommon {
175 min_exec_us: 800,
176 yield_ignore: 0.0,
177 preempt: true,
178 preempt_first: false,
179 exclusive: false,
180 allow_node_aligned: false,
181 skip_remote_node: false,
182 prev_over_idle_core: false,
183 idle_smt: None,
184 slice_us: 800,
185 fifo: false,
186 weight: DEFAULT_LAYER_WEIGHT,
187 disallow_open_after_us: None,
188 disallow_preempt_after_us: None,
189 xllc_mig_min_us: 0.0,
190 growth_algo: LayerGrowthAlgo::Topo,
191 perf: 1024,
192 idle_resume_us: None,
193 nodes: vec![],
194 llcs: vec![],
195 },
196 },
197 },
198 LayerSpec {
199 name: "normal".into(),
200 comment: Some("the rest".into()),
201 matches: vec![vec![]],
202 kind: LayerKind::Grouped {
203 cpus_range: None,
204 util_range: (0.5, 0.6),
205 protected: false,
206 cpus_range_frac: None,
207 common: LayerCommon {
208 min_exec_us: 200,
209 yield_ignore: 0.0,
210 preempt: false,
211 preempt_first: false,
212 exclusive: false,
213 allow_node_aligned: false,
214 skip_remote_node: false,
215 prev_over_idle_core: false,
216 idle_smt: None,
217 slice_us: 20000,
218 fifo: false,
219 weight: DEFAULT_LAYER_WEIGHT,
220 disallow_open_after_us: None,
221 disallow_preempt_after_us: None,
222 xllc_mig_min_us: 100.0,
223 growth_algo: LayerGrowthAlgo::Linear,
224 perf: 1024,
225 idle_resume_us: None,
226 nodes: vec![],
227 llcs: vec![],
228 },
229 },
230 },
231 ],
232 };
233}
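
// A minimal sanity-check sketch of the decay math above (assumes the crate's normal
// `cargo test` harness; not part of the scheduler itself): since USAGE_DECAY is
// 0.5^(1 / USAGE_HALF_LIFE_F64), raising it to one half-life worth of elapsed seconds
// must halve a decayed average, which is exactly how Stats::refresh() applies it via
// USAGE_DECAY.powf(elapsed_f64).
#[cfg(test)]
mod usage_decay_test {
    use super::*;

    #[test]
    fn decay_halves_after_one_half_life() {
        let decayed = USAGE_DECAY.powf(USAGE_HALF_LIFE_F64);
        assert!((decayed - 0.5).abs() < 1e-6);
    }
}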
234
235#[derive(Debug, Parser)]
490#[command(verbatim_doc_comment)]
491struct Opts {
492 #[clap(short = 's', long, default_value = "20000")]
494 slice_us: u64,
495
496 #[clap(short = 'M', long, default_value = "0")]
501 max_exec_us: u64,
502
503 #[clap(short = 'i', long, default_value = "0.1")]
505 interval: f64,
506
507 #[clap(short = 'n', long, default_value = "false")]
510 no_load_frac_limit: bool,
511
512 #[clap(long, default_value = "0")]
514 exit_dump_len: u32,
515
516 #[clap(short = 'v', long, action = clap::ArgAction::Count)]
519 verbose: u8,
520
521 #[arg(short = 't', long, num_args = 0..=1, default_missing_value = "true", require_equals = true)]
525 disable_topology: Option<bool>,
526
527 #[clap(long)]
529 xnuma_preemption: bool,
530
531 #[clap(long)]
533 monitor_disable: bool,
534
535 #[clap(short = 'e', long)]
537 example: Option<String>,
538
539 #[clap(long, default_value = "0.0")]
543 layer_preempt_weight_disable: f64,
544
545 #[clap(long, default_value = "0.0")]
549 layer_growth_weight_disable: f64,
550
551 #[clap(long)]
553 stats: Option<f64>,
554
555 #[clap(long)]
558 monitor: Option<f64>,
559
560 #[clap(long)]
562 run_example: bool,
563
564 #[clap(long, default_value = "false")]
567 local_llc_iteration: bool,
568
569 #[clap(long, default_value = "10000")]
574 lo_fb_wait_us: u64,
575
576 #[clap(long, default_value = ".05")]
579 lo_fb_share: f64,
580
581 #[clap(long, default_value = "false")]
583 disable_antistall: bool,
584
585 #[clap(long, default_value = "3")]
587 antistall_sec: u64,
588
589 #[clap(long, default_value = "false")]
591 enable_gpu_support: bool,
592
593 #[clap(long, default_value = "3")]
601 gpu_kprobe_level: u64,
602
603 #[clap(long, default_value = "false")]
605 netdev_irq_balance: bool,
606
607 #[clap(long, default_value = "false")]
609 disable_queued_wakeup: bool,
610
611 #[clap(long, default_value = "false")]
613 disable_percpu_kthread_preempt: bool,
614
615 #[clap(long)]
617 help_stats: bool,
618
619 specs: Vec<String>,
621}
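
// A hedged CLI sketch (long/short flag names follow from the clap derives above; the
// exact layer-spec string format is parsed elsewhere and is only a placeholder here):
//
//   # run with the built-in EXAMPLE_CONFIG layers
//   scx_layered --run-example
//
//   # 20ms slices, 0.1s scheduling interval, stats every 2s, caller-provided specs
//   scx_layered -s 20000 -i 0.1 --stats 2.0 <LAYER_SPECS>...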
622
623fn read_total_cpu(reader: &procfs::ProcReader) -> Result<procfs::CpuStat> {
624 reader
625 .read_stat()
626 .context("Failed to read procfs")?
627 .total_cpu
628 .ok_or_else(|| anyhow!("Could not read total cpu stat in proc"))
629}
630
631fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result<f64> {
632 match (curr, prev) {
633 (
634 procfs::CpuStat {
635 user_usec: Some(curr_user),
636 nice_usec: Some(curr_nice),
637 system_usec: Some(curr_system),
638 idle_usec: Some(curr_idle),
639 iowait_usec: Some(curr_iowait),
640 irq_usec: Some(curr_irq),
641 softirq_usec: Some(curr_softirq),
642 stolen_usec: Some(curr_stolen),
643 ..
644 },
645 procfs::CpuStat {
646 user_usec: Some(prev_user),
647 nice_usec: Some(prev_nice),
648 system_usec: Some(prev_system),
649 idle_usec: Some(prev_idle),
650 iowait_usec: Some(prev_iowait),
651 irq_usec: Some(prev_irq),
652 softirq_usec: Some(prev_softirq),
653 stolen_usec: Some(prev_stolen),
654 ..
655 },
656 ) => {
657 let idle_usec = curr_idle.saturating_sub(*prev_idle);
658 let iowait_usec = curr_iowait.saturating_sub(*prev_iowait);
659 let user_usec = curr_user.saturating_sub(*prev_user);
660 let system_usec = curr_system.saturating_sub(*prev_system);
661 let nice_usec = curr_nice.saturating_sub(*prev_nice);
662 let irq_usec = curr_irq.saturating_sub(*prev_irq);
663 let softirq_usec = curr_softirq.saturating_sub(*prev_softirq);
664 let stolen_usec = curr_stolen.saturating_sub(*prev_stolen);
665
666 let busy_usec =
667 user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec;
668 let total_usec = idle_usec + busy_usec + iowait_usec;
669 if total_usec > 0 {
670 Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0))
671 } else {
672 Ok(1.0)
673 }
674 }
675 _ => bail!("Missing stats in cpustat"),
676 }
677}
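
// Worked example for calc_util() with made-up deltas (all values in usec):
// busy = user 300 + system 100 + nice 0 + irq 20 + softirq 30 + stolen 0 = 450,
// total = busy + idle 500 + iowait 50 = 1000, so utilization = 450 / 1000 = 0.45.
// Note that iowait counts toward total but not busy, and a zero total (e.g. two
// identical snapshots) is reported as fully busy (1.0) rather than dividing by zero.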
678
679fn copy_into_cstr(dst: &mut [i8], src: &str) {
680 let cstr = CString::new(src).unwrap();
681 let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) };
682 dst[0..bytes.len()].copy_from_slice(bytes);
683}
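
// A small test sketch for copy_into_cstr() (assumes `cargo test`): the copy must be
// NUL-terminated and must not disturb bytes past the terminator.
#[cfg(test)]
mod copy_into_cstr_test {
    use super::*;

    #[test]
    fn copies_bytes_and_nul() {
        let mut dst = [0x7fi8; 8];
        copy_into_cstr(&mut dst, "abc");
        assert_eq!(&dst[..4], &[b'a' as i8, b'b' as i8, b'c' as i8, 0]);
        assert_eq!(dst[4], 0x7f); // untouched past the NUL
    }
}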
684
685fn nodemask_from_nodes(nodes: &Vec<usize>) -> usize {
686 let mut mask = 0;
687 for node in nodes {
688 mask |= 1 << node;
689 }
690 mask
691}
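
// A quick property sketch for nodemask_from_nodes() (assumes `cargo test`):
// each node ID sets exactly its own bit in the mask.
#[cfg(test)]
mod nodemask_test {
    use super::*;

    #[test]
    fn sets_one_bit_per_node() {
        assert_eq!(nodemask_from_nodes(&vec![]), 0);
        assert_eq!(nodemask_from_nodes(&vec![0, 2, 5]), 0b100101);
    }
}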
692
693fn llcmask_from_llcs(llcs: &BTreeMap<usize, Arc<Llc>>) -> usize {
694 let mut mask = 0;
695 for (_, cache) in llcs {
696 mask |= 1 << cache.id;
697 }
698 mask
699}
700
701fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
702 let mut cpu_ctxs = vec![];
703 let cpu_ctxs_vec = skel
704 .maps
705 .cpu_ctxs
706 .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
707 .context("Failed to lookup cpu_ctx")?
708 .unwrap();
709 for cpu in 0..*NR_CPUS_POSSIBLE {
710 cpu_ctxs.push(*unsafe {
711 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
712 });
713 }
714 Ok(cpu_ctxs)
715}
716
717#[derive(Clone, Debug)]
718struct BpfStats {
719 gstats: Vec<u64>,
720 lstats: Vec<Vec<u64>>,
721 lstats_sums: Vec<u64>,
    llc_lstats: Vec<Vec<Vec<u64>>>, // [layer][llc][stat]
}
724
725impl BpfStats {
726 fn read(skel: &BpfSkel, cpu_ctxs: &[bpf_intf::cpu_ctx]) -> Self {
727 let nr_layers = skel.maps.rodata_data.nr_layers as usize;
728 let nr_llcs = skel.maps.rodata_data.nr_llcs as usize;
729 let mut gstats = vec![0u64; NR_GSTATS];
730 let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers];
731 let mut llc_lstats = vec![vec![vec![0u64; NR_LLC_LSTATS]; nr_llcs]; nr_layers];
732
733 for cpu in 0..*NR_CPUS_POSSIBLE {
734 for stat in 0..NR_GSTATS {
735 gstats[stat] += cpu_ctxs[cpu].gstats[stat];
736 }
737 for layer in 0..nr_layers {
738 for stat in 0..NR_LSTATS {
739 lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat];
740 }
741 }
742 }
743
744 let mut lstats_sums = vec![0u64; NR_LSTATS];
745 for layer in 0..nr_layers {
746 for stat in 0..NR_LSTATS {
747 lstats_sums[stat] += lstats[layer][stat];
748 }
749 }
750
751 for llc_id in 0..nr_llcs {
752 let key = llc_id as u32;
758 let llc_id_slice =
759 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
760 let v = skel
761 .maps
762 .llc_data
763 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
764 .unwrap()
765 .unwrap();
766 let llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
767
768 for layer_id in 0..nr_layers {
769 for stat_id in 0..NR_LLC_LSTATS {
770 llc_lstats[layer_id][llc_id][stat_id] = llcc.lstats[layer_id][stat_id];
771 }
772 }
773 }
774
775 Self {
776 gstats,
777 lstats,
778 lstats_sums,
779 llc_lstats,
780 }
781 }
782}
783
784impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats {
785 type Output = BpfStats;
786
787 fn sub(self, rhs: &'b BpfStats) -> BpfStats {
788 let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect();
789 BpfStats {
790 gstats: vec_sub(&self.gstats, &rhs.gstats),
791 lstats: self
792 .lstats
793 .iter()
794 .zip(rhs.lstats.iter())
795 .map(|(l, r)| vec_sub(l, r))
796 .collect(),
797 lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums),
798 llc_lstats: self
799 .llc_lstats
800 .iter()
801 .zip(rhs.llc_lstats.iter())
802 .map(|(l_layer, r_layer)| {
803 l_layer
804 .iter()
805 .zip(r_layer.iter())
806 .map(|(l_llc, r_llc)| {
                        let (l_llc, mut r_llc) = (l_llc.clone(), r_llc.clone());
                        // LLC_LSTAT_LAT is not a monotonically increasing counter,
                        // so subtracting the previous reading would be meaningless.
                        // Zero the rhs so the "delta" carries the latest value.
                        r_llc[bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize] = 0;
810 vec_sub(&l_llc, &r_llc)
811 })
812 .collect()
813 })
814 .collect(),
815 }
816 }
817}
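
// Typical use of the Sub impl above (a sketch): BpfStats counters are cumulative, so a
// per-interval snapshot is obtained by subtracting the previous reading, e.g.
// `let delta = &cur_bpf_stats - &prev_bpf_stats;` as done in Stats::refresh().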
818
819#[derive(Clone, Debug)]
820struct Stats {
821 at: Instant,
822 elapsed: Duration,
823 nr_layers: usize,
824 nr_layer_tasks: Vec<usize>,
825 nr_nodes: usize,
826
    total_util: f64, // decayed sum of layer_utils up to LAYER_USAGE_SUM_UPTO
    layer_utils: Vec<Vec<f64>>, // decayed per-layer utilization, [layer][LAYER_USAGE_*]
829 prev_layer_usages: Vec<Vec<u64>>,
830
    cpu_busy: f64, // system-wide busy fraction from /proc/stat
    prev_total_cpu: procfs::CpuStat,
833
834 bpf_stats: BpfStats,
835 prev_bpf_stats: BpfStats,
836
837 processing_dur: Duration,
838 prev_processing_dur: Duration,
839
840 layer_slice_us: Vec<u64>,
841}
842
843impl Stats {
844 fn read_layer_usages(cpu_ctxs: &[bpf_intf::cpu_ctx], nr_layers: usize) -> Vec<Vec<u64>> {
845 let mut layer_usages = vec![vec![0u64; NR_LAYER_USAGES]; nr_layers];
846
847 for cpu in 0..*NR_CPUS_POSSIBLE {
848 for layer in 0..nr_layers {
849 for usage in 0..NR_LAYER_USAGES {
850 layer_usages[layer][usage] += cpu_ctxs[cpu].layer_usages[layer][usage];
851 }
852 }
853 }
854
855 layer_usages
856 }
857
858 fn new(skel: &mut BpfSkel, proc_reader: &procfs::ProcReader) -> Result<Self> {
859 let nr_layers = skel.maps.rodata_data.nr_layers as usize;
860 let cpu_ctxs = read_cpu_ctxs(skel)?;
861 let bpf_stats = BpfStats::read(skel, &cpu_ctxs);
862 let nr_nodes = skel.maps.rodata_data.nr_nodes as usize;
863
864 Ok(Self {
865 at: Instant::now(),
866 elapsed: Default::default(),
867 nr_layers,
868 nr_layer_tasks: vec![0; nr_layers],
869 nr_nodes,
870
871 total_util: 0.0,
872 layer_utils: vec![vec![0.0; NR_LAYER_USAGES]; nr_layers],
873 prev_layer_usages: Self::read_layer_usages(&cpu_ctxs, nr_layers),
874
875 cpu_busy: 0.0,
876 prev_total_cpu: read_total_cpu(proc_reader)?,
877
878 bpf_stats: bpf_stats.clone(),
879 prev_bpf_stats: bpf_stats,
880
881 processing_dur: Default::default(),
882 prev_processing_dur: Default::default(),
883
884 layer_slice_us: vec![0; nr_layers],
885 })
886 }
887
888 fn refresh(
889 &mut self,
890 skel: &mut BpfSkel,
891 proc_reader: &procfs::ProcReader,
892 now: Instant,
893 cur_processing_dur: Duration,
894 ) -> Result<()> {
895 let elapsed = now.duration_since(self.at);
896 let elapsed_f64 = elapsed.as_secs_f64();
897 let cpu_ctxs = read_cpu_ctxs(skel)?;
898
899 let nr_layer_tasks: Vec<usize> = skel
900 .maps
901 .bss_data
902 .layers
903 .iter()
904 .take(self.nr_layers)
905 .map(|layer| layer.nr_tasks as usize)
906 .collect();
907 let layer_slice_us: Vec<u64> = skel
908 .maps
909 .bss_data
910 .layers
911 .iter()
912 .take(self.nr_layers)
913 .map(|layer| layer.slice_ns / 1000_u64)
914 .collect();
915
916 let cur_layer_usages = Self::read_layer_usages(&cpu_ctxs, self.nr_layers);
917 let cur_layer_utils: Vec<Vec<f64>> = cur_layer_usages
918 .iter()
919 .zip(self.prev_layer_usages.iter())
920 .map(|(cur, prev)| {
921 cur.iter()
922 .zip(prev.iter())
923 .map(|(c, p)| (c - p) as f64 / 1_000_000_000.0 / elapsed_f64)
924 .collect()
925 })
926 .collect();
927 let layer_utils: Vec<Vec<f64>> = cur_layer_utils
928 .iter()
929 .zip(self.layer_utils.iter())
930 .map(|(cur, prev)| {
931 cur.iter()
932 .zip(prev.iter())
933 .map(|(c, p)| {
934 let decay = USAGE_DECAY.powf(elapsed_f64);
935 p * decay + c * (1.0 - decay)
936 })
937 .collect()
938 })
939 .collect();
940
941 let cur_total_cpu = read_total_cpu(proc_reader)?;
942 let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?;
943
944 let cur_bpf_stats = BpfStats::read(skel, &cpu_ctxs);
945 let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats;
946
947 let processing_dur = cur_processing_dur
948 .checked_sub(self.prev_processing_dur)
949 .unwrap();
950
951 *self = Self {
952 at: now,
953 elapsed,
954 nr_layers: self.nr_layers,
955 nr_layer_tasks,
956 nr_nodes: self.nr_nodes,
957
958 total_util: layer_utils
959 .iter()
960 .map(|x| x.iter().take(LAYER_USAGE_SUM_UPTO + 1).sum::<f64>())
961 .sum(),
962 layer_utils,
963 prev_layer_usages: cur_layer_usages,
964
965 cpu_busy,
966 prev_total_cpu: cur_total_cpu,
967
968 bpf_stats,
969 prev_bpf_stats: cur_bpf_stats,
970
971 processing_dur,
972 prev_processing_dur: cur_processing_dur,
973
974 layer_slice_us,
975 };
976 Ok(())
977 }
978}
979
980#[derive(Debug)]
981struct Layer {
982 name: String,
983 kind: LayerKind,
984 growth_algo: LayerGrowthAlgo,
985 core_order: Vec<usize>,
986
    // Target (full LLCs, extra cores) computed by compute_target_llcs();
    // only meaningful for the StickyDynamic growth algorithm.
    target_llc_cpus: (usize, usize),
    // LLCs currently assigned to this layer (StickyDynamic only).
    assigned_llcs: Vec<usize>,

    nr_cpus: usize,
    // Number of CPUs this layer owns in each LLC, indexed by LLC id.
    nr_llc_cpus: Vec<usize>,
    // CPUs currently owned by the layer.
    cpus: Cpumask,
    // CPUs the layer is allowed to own, derived from its nodes/llcs constraints.
    allowed_cpus: Cpumask,
994}
995
996fn resolve_cpus_pct_range(
997 cpus_range: &Option<(usize, usize)>,
998 cpus_range_frac: &Option<(f64, f64)>,
999 max_cpus: usize,
1000) -> Result<(usize, usize)> {
1001 match (cpus_range, cpus_range_frac) {
1002 (Some(_x), Some(_y)) => {
            bail!("cpus_range cannot be used together with cpus_range_frac.");
1004 }
1005 (Some((cpus_range_min, cpus_range_max)), None) => Ok((*cpus_range_min, *cpus_range_max)),
1006 (None, Some((cpus_frac_min, cpus_frac_max))) => {
1007 if *cpus_frac_min < 0_f64
1008 || *cpus_frac_min > 1_f64
1009 || *cpus_frac_max < 0_f64
1010 || *cpus_frac_max > 1_f64
1011 {
1012 bail!("cpus_range_frac values must be between 0.0 and 1.0");
1013 }
1014 let cpus_min_count = ((max_cpus as f64) * cpus_frac_min).round_ties_even() as usize;
1015 let cpus_max_count = ((max_cpus as f64) * cpus_frac_max).round_ties_even() as usize;
1016 Ok((
1017 std::cmp::max(cpus_min_count, 1),
1018 std::cmp::min(cpus_max_count, max_cpus),
1019 ))
1020 }
1021 (None, None) => Ok((0, max_cpus)),
1022 }
1023}
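
// A test sketch for resolve_cpus_pct_range() (assumes `cargo test`): fractional ranges
// scale against max_cpus with a floor of one CPU, explicit ranges pass through, and
// supplying both forms is rejected.
#[cfg(test)]
mod cpus_range_test {
    use super::*;

    #[test]
    fn resolves_ranges() {
        assert_eq!(
            resolve_cpus_pct_range(&None, &Some((0.25, 0.5)), 16).unwrap(),
            (4, 8)
        );
        assert_eq!(resolve_cpus_pct_range(&None, &None, 16).unwrap(), (0, 16));
        assert_eq!(
            resolve_cpus_pct_range(&Some((2, 4)), &None, 16).unwrap(),
            (2, 4)
        );
        assert!(resolve_cpus_pct_range(&Some((2, 4)), &Some((0.1, 0.2)), 16).is_err());
    }
}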
1024
1025impl Layer {
1026 fn new(spec: &LayerSpec, topo: &Topology, core_order: &Vec<usize>) -> Result<Self> {
1027 let name = &spec.name;
1028 let kind = spec.kind.clone();
1029 let mut allowed_cpus = Cpumask::new();
1030 match &kind {
1031 LayerKind::Confined {
1032 cpus_range,
1033 cpus_range_frac,
1034 util_range,
1035 common: LayerCommon { nodes, llcs, .. },
1036 ..
1037 } => {
1038 let cpus_range =
1039 resolve_cpus_pct_range(cpus_range, cpus_range_frac, topo.all_cpus.len())?;
1040 if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 {
1041 bail!("invalid cpus_range {:?}", cpus_range);
1042 }
1043 if nodes.is_empty() && llcs.is_empty() {
1044 allowed_cpus.set_all();
1045 } else {
1046 for (node_id, node) in &topo.nodes {
1048 if nodes.contains(node_id) {
1050 for &id in node.all_cpus.keys() {
1051 allowed_cpus.set_cpu(id)?;
1052 }
1053 }
1054 for (llc_id, llc) in &node.llcs {
1056 if llcs.contains(llc_id) {
1057 for &id in llc.all_cpus.keys() {
1058 allowed_cpus.set_cpu(id)?;
1059 }
1060 }
1061 }
1062 }
1063 }
1064
1065 if util_range.0 < 0.0
1066 || util_range.0 > 1.0
1067 || util_range.1 < 0.0
1068 || util_range.1 > 1.0
1069 || util_range.0 >= util_range.1
1070 {
1071 bail!("invalid util_range {:?}", util_range);
1072 }
1073 }
1074 LayerKind::Grouped {
1075 common: LayerCommon { nodes, llcs, .. },
1076 ..
1077 }
1078 | LayerKind::Open {
1079 common: LayerCommon { nodes, llcs, .. },
1080 ..
1081 } => {
1082 if nodes.is_empty() && llcs.is_empty() {
1083 allowed_cpus.set_all();
1084 } else {
1085 for (node_id, node) in &topo.nodes {
1087 if nodes.contains(node_id) {
1089 for &id in node.all_cpus.keys() {
1090 allowed_cpus.set_cpu(id)?;
1091 }
1092 }
1093 for (llc_id, llc) in &node.llcs {
1095 if llcs.contains(llc_id) {
1096 for &id in llc.all_cpus.keys() {
1097 allowed_cpus.set_cpu(id)?;
1098 }
1099 }
1100 }
1101 }
1102 }
1103 }
1104 }
1105
1106 let layer_growth_algo = kind.common().growth_algo.clone();
1107
1108 debug!(
1109 "layer: {} algo: {:?} core order: {:?}",
1110 name, &layer_growth_algo, core_order
1111 );
1112
1113 Ok(Self {
1114 name: name.into(),
1115 kind,
1116 growth_algo: layer_growth_algo,
1117 core_order: core_order.clone(),
1118
1119 target_llc_cpus: (0, 0),
1120 assigned_llcs: vec![],
1121
1122 nr_cpus: 0,
1123 nr_llc_cpus: vec![0; topo.all_llcs.len()],
1124 cpus: Cpumask::new(),
1125 allowed_cpus,
1126 })
1127 }
1128
1129 fn free_some_cpus(&mut self, cpu_pool: &mut CpuPool, max_to_free: usize) -> Result<usize> {
1130 let cpus_to_free = match cpu_pool.next_to_free(&self.cpus, self.core_order.iter().rev())? {
1131 Some(ret) => ret.clone(),
1132 None => return Ok(0),
1133 };
1134
1135 let nr_to_free = cpus_to_free.weight();
1136
1137 Ok(if nr_to_free <= max_to_free {
1138 trace!("[{}] freeing CPUs: {}", self.name, &cpus_to_free);
1139 self.cpus &= &cpus_to_free.not();
1140 self.nr_cpus -= nr_to_free;
1141 for cpu in cpus_to_free.iter() {
1142 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] -= 1;
1143 }
1144 cpu_pool.free(&cpus_to_free)?;
1145 nr_to_free
1146 } else {
1147 0
1148 })
1149 }
1150
1151 fn alloc_some_cpus(&mut self, cpu_pool: &mut CpuPool) -> Result<usize> {
1152 let new_cpus = match cpu_pool
1153 .alloc_cpus(&self.allowed_cpus, &self.core_order)
1154 .clone()
1155 {
1156 Some(ret) => ret.clone(),
1157 None => {
1158 trace!("layer-{} can't grow, no CPUs", &self.name);
1159 return Ok(0);
1160 }
1161 };
1162
1163 let nr_new_cpus = new_cpus.weight();
1164
1165 trace!("[{}] adding CPUs: {}", &self.name, &new_cpus);
1166 self.cpus |= &new_cpus;
1167 self.nr_cpus += nr_new_cpus;
1168 for cpu in new_cpus.iter() {
1169 self.nr_llc_cpus[cpu_pool.topo.all_cpus[&cpu].llc_id] += 1;
1170 }
1171 Ok(nr_new_cpus)
1172 }
1173}
1174
1175struct Scheduler<'a> {
1176 skel: BpfSkel<'a>,
1177 struct_ops: Option<libbpf_rs::Link>,
1178 layer_specs: Vec<LayerSpec>,
1179
1180 sched_intv: Duration,
1181
1182 cpu_pool: CpuPool,
1183 layers: Vec<Layer>,
1184 idle_qos_enabled: bool,
1185
1186 proc_reader: procfs::ProcReader,
1187 sched_stats: Stats,
1188
1189 nr_layer_cpus_ranges: Vec<(usize, usize)>,
1190 processing_dur: Duration,
1191
1192 topo: Arc<Topology>,
1193 netdevs: BTreeMap<String, NetDev>,
1194 stats_server: StatsServer<StatsReq, StatsRes>,
1195}
1196
1197impl<'a> Scheduler<'a> {
1198 fn init_layers(skel: &mut OpenBpfSkel, specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1199 skel.maps.rodata_data.nr_layers = specs.len() as u32;
1200 let mut perf_set = false;
1201
1202 let mut layer_iteration_order = (0..specs.len()).collect::<Vec<_>>();
1203 let mut layer_weights: Vec<usize> = vec![];
1204
1205 for (spec_i, spec) in specs.iter().enumerate() {
1206 let layer = &mut skel.maps.bss_data.layers[spec_i];
1207
1208 for (or_i, or) in spec.matches.iter().enumerate() {
1209 for (and_i, and) in or.iter().enumerate() {
1210 let mt = &mut layer.matches[or_i].matches[and_i];
1211
1212 mt.exclude.write(false);
1214
1215 match and {
1216 LayerMatch::CgroupPrefix(prefix) => {
1217 mt.kind = bpf_intf::layer_match_kind_MATCH_CGROUP_PREFIX as i32;
1218 copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str());
1219 }
1220 LayerMatch::CommPrefix(prefix) => {
1221 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1222 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1223 }
1224 LayerMatch::CommPrefixExclude(prefix) => {
1225 mt.kind = bpf_intf::layer_match_kind_MATCH_COMM_PREFIX as i32;
1226 mt.exclude.write(true);
1227 copy_into_cstr(&mut mt.comm_prefix, prefix.as_str());
1228 }
1229 LayerMatch::PcommPrefix(prefix) => {
1230 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1231 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1232 }
1233 LayerMatch::PcommPrefixExclude(prefix) => {
1234 mt.kind = bpf_intf::layer_match_kind_MATCH_PCOMM_PREFIX as i32;
1235 mt.exclude.write(true);
1236 copy_into_cstr(&mut mt.pcomm_prefix, prefix.as_str());
1237 }
1238 LayerMatch::NiceAbove(nice) => {
1239 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_ABOVE as i32;
1240 mt.nice = *nice;
1241 }
1242 LayerMatch::NiceBelow(nice) => {
1243 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_BELOW as i32;
1244 mt.nice = *nice;
1245 }
1246 LayerMatch::NiceEquals(nice) => {
1247 mt.kind = bpf_intf::layer_match_kind_MATCH_NICE_EQUALS as i32;
1248 mt.nice = *nice;
1249 }
1250 LayerMatch::UIDEquals(user_id) => {
1251 mt.kind = bpf_intf::layer_match_kind_MATCH_USER_ID_EQUALS as i32;
1252 mt.user_id = *user_id;
1253 }
1254 LayerMatch::GIDEquals(group_id) => {
1255 mt.kind = bpf_intf::layer_match_kind_MATCH_GROUP_ID_EQUALS as i32;
1256 mt.group_id = *group_id;
1257 }
1258 LayerMatch::PIDEquals(pid) => {
1259 mt.kind = bpf_intf::layer_match_kind_MATCH_PID_EQUALS as i32;
1260 mt.pid = *pid;
1261 }
1262 LayerMatch::PPIDEquals(ppid) => {
1263 mt.kind = bpf_intf::layer_match_kind_MATCH_PPID_EQUALS as i32;
1264 mt.ppid = *ppid;
1265 }
1266 LayerMatch::TGIDEquals(tgid) => {
1267 mt.kind = bpf_intf::layer_match_kind_MATCH_TGID_EQUALS as i32;
1268 mt.tgid = *tgid;
1269 }
1270 LayerMatch::NSPIDEquals(nsid, pid) => {
1271 mt.kind = bpf_intf::layer_match_kind_MATCH_NSPID_EQUALS as i32;
1272 mt.nsid = *nsid;
1273 mt.pid = *pid;
1274 }
1275 LayerMatch::NSEquals(nsid) => {
1276 mt.kind = bpf_intf::layer_match_kind_MATCH_NS_EQUALS as i32;
1277 mt.nsid = *nsid as u64;
1278 }
1279 LayerMatch::CmdJoin(joincmd) => {
1280 mt.kind = bpf_intf::layer_match_kind_MATCH_SCXCMD_JOIN as i32;
1281 copy_into_cstr(&mut mt.comm_prefix, joincmd);
1282 }
1283 LayerMatch::IsGroupLeader(polarity) => {
1284 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_GROUP_LEADER as i32;
1285 mt.is_group_leader.write(*polarity);
1286 }
1287 LayerMatch::IsKthread(polarity) => {
1288 mt.kind = bpf_intf::layer_match_kind_MATCH_IS_KTHREAD as i32;
1289 mt.is_kthread.write(*polarity);
1290 }
1291 LayerMatch::UsedGpuTid(polarity) => {
1292 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_TID as i32;
1293 mt.used_gpu_tid.write(*polarity);
1294 }
1295 LayerMatch::UsedGpuPid(polarity) => {
1296 mt.kind = bpf_intf::layer_match_kind_MATCH_USED_GPU_PID as i32;
1297 mt.used_gpu_pid.write(*polarity);
1298 }
1299 }
1300 }
1301 layer.matches[or_i].nr_match_ands = or.len() as i32;
1302 }
1303
1304 layer.nr_match_ors = spec.matches.len() as u32;
1305 layer.kind = spec.kind.as_bpf_enum();
1306
1307 {
1308 let LayerCommon {
1309 min_exec_us,
1310 yield_ignore,
1311 perf,
1312 preempt,
1313 preempt_first,
1314 exclusive,
1315 allow_node_aligned,
1316 skip_remote_node,
1317 prev_over_idle_core,
1318 growth_algo,
1319 nodes,
1320 slice_us,
1321 fifo,
1322 weight,
1323 disallow_open_after_us,
1324 disallow_preempt_after_us,
1325 xllc_mig_min_us,
1326 ..
1327 } = spec.kind.common();
1328
1329 layer.slice_ns = *slice_us * 1000;
1330 layer.fifo.write(*fifo);
1331 layer.min_exec_ns = min_exec_us * 1000;
1332 layer.yield_step_ns = if *yield_ignore > 0.999 {
1333 0
1334 } else if *yield_ignore < 0.001 {
1335 layer.slice_ns
1336 } else {
1337 (layer.slice_ns as f64 * (1.0 - *yield_ignore)) as u64
1338 };
1339 let mut layer_name: String = spec.name.clone();
                // Leave room for the trailing NUL appended by copy_into_cstr().
                layer_name.truncate(MAX_LAYER_NAME - 1);
1341 copy_into_cstr(&mut layer.name, layer_name.as_str());
1342 layer.preempt.write(*preempt);
1343 layer.preempt_first.write(*preempt_first);
1344 layer.exclusive.write(*exclusive);
1345 layer.allow_node_aligned.write(*allow_node_aligned);
1346 layer.skip_remote_node.write(*skip_remote_node);
1347 layer.prev_over_idle_core.write(*prev_over_idle_core);
1348 layer.growth_algo = growth_algo.as_bpf_enum();
1349 layer.weight = *weight;
1350 layer.disallow_open_after_ns = match disallow_open_after_us.unwrap() {
1351 v if v == u64::MAX => v,
1352 v => v * 1000,
1353 };
1354 layer.disallow_preempt_after_ns = match disallow_preempt_after_us.unwrap() {
1355 v if v == u64::MAX => v,
1356 v => v * 1000,
1357 };
1358 layer.xllc_mig_min_ns = (xllc_mig_min_us * 1000.0) as u64;
1359 layer_weights.push(layer.weight.try_into().unwrap());
1360 layer.perf = u32::try_from(*perf)?;
1361 layer.node_mask = nodemask_from_nodes(nodes) as u64;
1362 for (topo_node_id, topo_node) in &topo.nodes {
1363 if !nodes.is_empty() && !nodes.contains(topo_node_id) {
1364 continue;
1365 }
1366 layer.llc_mask |= llcmask_from_llcs(&topo_node.llcs) as u64;
1367 }
1368 }
1369
1370 layer.is_protected.write(match spec.kind {
1371 LayerKind::Open { .. } => false,
1372 LayerKind::Confined { protected, .. } | LayerKind::Grouped { protected, .. } => {
1373 protected
1374 }
1375 });
1376
1377 perf_set |= layer.perf > 0;
1378 }
1379
1380 layer_iteration_order.sort_by(|i, j| layer_weights[*i].cmp(&layer_weights[*j]));
1381 for (idx, layer_idx) in layer_iteration_order.iter().enumerate() {
1382 skel.maps.rodata_data.layer_iteration_order[idx] = *layer_idx as u32;
1383 }
1384
1385 if perf_set && !compat::ksym_exists("scx_bpf_cpuperf_set")? {
1386 warn!("cpufreq support not available, ignoring perf configurations");
1387 }
1388
1389 Ok(())
1390 }
1391
1392 fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
1393 skel.maps.rodata_data.nr_nodes = topo.nodes.len() as u32;
1394 skel.maps.rodata_data.nr_llcs = 0;
1395
1396 for (&node_id, node) in &topo.nodes {
1397 debug!("configuring node {}, LLCs {:?}", node_id, node.llcs.len());
1398 skel.maps.rodata_data.nr_llcs += node.llcs.len() as u32;
1399 let raw_numa_slice = node.span.as_raw_slice();
1400 let node_cpumask_slice = &mut skel.maps.rodata_data.numa_cpumasks[node_id];
1401 let (left, _) = node_cpumask_slice.split_at_mut(raw_numa_slice.len());
1402 left.clone_from_slice(raw_numa_slice);
1403 debug!(
1404 "node {} mask: {:?}",
1405 node_id, skel.maps.rodata_data.numa_cpumasks[node_id]
1406 );
1407
1408 for llc in node.llcs.values() {
1409 debug!("configuring llc {:?} for node {:?}", llc.id, node_id);
1410 skel.maps.rodata_data.llc_numa_id_map[llc.id] = node_id as u32;
1411 }
1412 }
1413
1414 for cpu in topo.all_cpus.values() {
1415 skel.maps.rodata_data.cpu_llc_id_map[cpu.id] = cpu.llc_id as u32;
1416 }
1417 }
1418
1419 fn init_cpu_prox_map(topo: &Topology, cpu_ctxs: &mut [bpf_intf::cpu_ctx]) {
1420 let radiate = |mut vec: Vec<usize>, center_id: usize| -> Vec<usize> {
1421 vec.sort_by_key(|&id| (center_id as i32 - id as i32).abs());
1422 vec
1423 };
1424 let radiate_cpu =
1425 |mut vec: Vec<usize>, center_cpu: usize, center_core: usize| -> Vec<usize> {
1426 vec.sort_by_key(|&id| {
1427 (
1428 (center_core as i32 - topo.all_cpus.get(&id).unwrap().core_id as i32).abs(),
1429 (center_cpu as i32 - id as i32).abs(),
1430 )
1431 });
1432 vec
1433 };
1434
1435 for (&cpu_id, cpu) in &topo.all_cpus {
1436 let mut core_span = topo.all_cores[&cpu.core_id].span.clone();
1438 let llc_span = &topo.all_llcs[&cpu.llc_id].span;
1439 let node_span = &topo.nodes[&cpu.node_id].span;
1440 let sys_span = &topo.span;
1441
1442 let sys_span = sys_span.and(&node_span.not());
1444 let node_span = node_span.and(&llc_span.not());
1445 let llc_span = llc_span.and(&core_span.not());
1446 core_span.clear_cpu(cpu_id).unwrap();
1447
1448 let mut sys_order: Vec<usize> = sys_span.iter().collect();
1450 let mut node_order: Vec<usize> = node_span.iter().collect();
1451 let mut llc_order: Vec<usize> = llc_span.iter().collect();
1452 let mut core_order: Vec<usize> = core_span.iter().collect();
1453
1454 sys_order = radiate_cpu(sys_order, cpu_id, cpu.core_id);
1459 node_order = radiate(node_order, cpu.node_id);
1460 llc_order = radiate_cpu(llc_order, cpu_id, cpu.core_id);
1461 core_order = radiate_cpu(core_order, cpu_id, cpu.core_id);
1462
1463 let mut order: Vec<usize> = vec![];
1465 let mut idx: usize = 0;
1466
1467 idx += 1;
1468 order.push(cpu_id);
1469
1470 idx += core_order.len();
1471 order.append(&mut core_order);
1472 let core_end = idx;
1473
1474 idx += llc_order.len();
1475 order.append(&mut llc_order);
1476 let llc_end = idx;
1477
1478 idx += node_order.len();
1479 order.append(&mut node_order);
1480 let node_end = idx;
1481
1482 idx += sys_order.len();
1483 order.append(&mut sys_order);
1484 let sys_end = idx;
1485
1486 debug!(
1487 "CPU[{}] proximity map[{}/{}/{}/{}]: {:?}",
1488 cpu_id, core_end, llc_end, node_end, sys_end, &order
1489 );
1490
1491 let pmap = &mut cpu_ctxs[cpu_id].prox_map;
1493 for (i, &cpu) in order.iter().enumerate() {
1494 pmap.cpus[i] = cpu as u16;
1495 }
1496 pmap.core_end = core_end as u32;
1497 pmap.llc_end = llc_end as u32;
1498 pmap.node_end = node_end as u32;
1499 pmap.sys_end = sys_end as u32;
1500 }
1501 }
1502
1503 fn convert_cpu_ctxs(cpu_ctxs: Vec<bpf_intf::cpu_ctx>) -> Vec<Vec<u8>> {
1504 cpu_ctxs
1505 .into_iter()
1506 .map(|cpu_ctx| {
1507 let bytes = unsafe {
1508 std::slice::from_raw_parts(
1509 &cpu_ctx as *const bpf_intf::cpu_ctx as *const u8,
1510 std::mem::size_of::<bpf_intf::cpu_ctx>(),
1511 )
1512 };
1513 bytes.to_vec()
1514 })
1515 .collect()
1516 }
1517
1518 fn init_cpus(skel: &BpfSkel, layer_specs: &[LayerSpec], topo: &Topology) -> Result<()> {
1519 let key = (0_u32).to_ne_bytes();
1520 let mut cpu_ctxs: Vec<bpf_intf::cpu_ctx> = vec![];
1521 let cpu_ctxs_vec = skel
1522 .maps
1523 .cpu_ctxs
1524 .lookup_percpu(&key, libbpf_rs::MapFlags::ANY)
1525 .context("Failed to lookup cpu_ctx")?
1526 .unwrap();
1527
1528 let op_layers: Vec<u32> = layer_specs
1529 .iter()
1530 .enumerate()
1531 .filter(|(_idx, spec)| match &spec.kind {
1532 LayerKind::Open { .. } => spec.kind.common().preempt,
1533 _ => false,
1534 })
1535 .map(|(idx, _)| idx as u32)
1536 .collect();
1537 let on_layers: Vec<u32> = layer_specs
1538 .iter()
1539 .enumerate()
1540 .filter(|(_idx, spec)| match &spec.kind {
1541 LayerKind::Open { .. } => !spec.kind.common().preempt,
1542 _ => false,
1543 })
1544 .map(|(idx, _)| idx as u32)
1545 .collect();
1546 let gp_layers: Vec<u32> = layer_specs
1547 .iter()
1548 .enumerate()
1549 .filter(|(_idx, spec)| match &spec.kind {
1550 LayerKind::Grouped { .. } => spec.kind.common().preempt,
1551 _ => false,
1552 })
1553 .map(|(idx, _)| idx as u32)
1554 .collect();
1555 let gn_layers: Vec<u32> = layer_specs
1556 .iter()
1557 .enumerate()
1558 .filter(|(_idx, spec)| match &spec.kind {
1559 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1560 _ => false,
1561 })
1562 .map(|(idx, _)| idx as u32)
1563 .collect();
1564
1565 for cpu in 0..*NR_CPUS_POSSIBLE {
1567 cpu_ctxs.push(*unsafe {
1568 &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
1569 });
1570
1571 let topo_cpu = topo.all_cpus.get(&cpu).unwrap();
1572 let is_big = topo_cpu.core_type == CoreType::Big { turbo: true };
1573 cpu_ctxs[cpu].cpu = cpu as i32;
1574 cpu_ctxs[cpu].layer_id = MAX_LAYERS as u32;
1575 cpu_ctxs[cpu].task_layer_id = MAX_LAYERS as u32;
1576 cpu_ctxs[cpu].is_big = is_big;
1577
1578 fastrand::seed(cpu as u64);
1579
1580 let mut ogp_order = op_layers.clone();
1581 ogp_order.append(&mut gp_layers.clone());
1582 fastrand::shuffle(&mut ogp_order);
1583
1584 let mut ogn_order = on_layers.clone();
1585 ogn_order.append(&mut gn_layers.clone());
1586 fastrand::shuffle(&mut ogn_order);
1587
1588 let mut op_order = op_layers.clone();
1589 fastrand::shuffle(&mut op_order);
1590
1591 let mut on_order = on_layers.clone();
1592 fastrand::shuffle(&mut on_order);
1593
1594 let mut gp_order = gp_layers.clone();
1595 fastrand::shuffle(&mut gp_order);
1596
1597 let mut gn_order = gn_layers.clone();
1598 fastrand::shuffle(&mut gn_order);
1599
1600 for i in 0..MAX_LAYERS {
1601 cpu_ctxs[cpu].ogp_layer_order[i] =
1602 ogp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1603 cpu_ctxs[cpu].ogn_layer_order[i] =
1604 ogn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1605
1606 cpu_ctxs[cpu].op_layer_order[i] =
1607 op_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1608 cpu_ctxs[cpu].on_layer_order[i] =
1609 on_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1610 cpu_ctxs[cpu].gp_layer_order[i] =
1611 gp_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1612 cpu_ctxs[cpu].gn_layer_order[i] =
1613 gn_order.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
1614 }
1615 }
1616
1617 Self::init_cpu_prox_map(topo, &mut cpu_ctxs);
1618
1619 skel.maps
1620 .cpu_ctxs
1621 .update_percpu(
1622 &key,
1623 &Self::convert_cpu_ctxs(cpu_ctxs),
1624 libbpf_rs::MapFlags::ANY,
1625 )
1626 .context("Failed to update cpu_ctx")?;
1627
1628 Ok(())
1629 }
1630
1631 fn init_llc_prox_map(skel: &mut BpfSkel, topo: &Topology) -> Result<()> {
1632 for (&llc_id, llc) in &topo.all_llcs {
1633 let mut node_order: Vec<usize> =
1635 topo.nodes[&llc.node_id].llcs.keys().cloned().collect();
1636 let mut sys_order: Vec<usize> = topo.all_llcs.keys().cloned().collect();
1637
1638 sys_order.retain(|id| !node_order.contains(id));
1640 node_order.retain(|&id| id != llc_id);
1641
1642 fastrand::seed(llc_id as u64);
1645 fastrand::shuffle(&mut sys_order);
1646 fastrand::shuffle(&mut node_order);
1647
1648 let mut order: Vec<usize> = vec![];
1650 let mut idx: usize = 0;
1651
1652 idx += 1;
1653 order.push(llc_id);
1654
1655 idx += node_order.len();
1656 order.append(&mut node_order);
1657 let node_end = idx;
1658
1659 idx += sys_order.len();
1660 order.append(&mut sys_order);
1661 let sys_end = idx;
1662
1663 debug!(
1664 "LLC[{}] proximity map[{}/{}]: {:?}",
1665 llc_id, node_end, sys_end, &order
1666 );
1667
1668 let key = llc_id as u32;
1673 let llc_id_slice =
1674 unsafe { std::slice::from_raw_parts((&key as *const u32) as *const u8, 4) };
1675 let v = skel
1676 .maps
1677 .llc_data
1678 .lookup(llc_id_slice, libbpf_rs::MapFlags::ANY)
1679 .unwrap()
1680 .unwrap();
1681 let mut llcc = unsafe { *(v.as_slice().as_ptr() as *const bpf_intf::llc_ctx) };
1682
1683 let pmap = &mut llcc.prox_map;
1684 for (i, &llc_id) in order.iter().enumerate() {
1685 pmap.llcs[i] = llc_id as u16;
1686 }
1687 pmap.node_end = node_end as u32;
1688 pmap.sys_end = sys_end as u32;
1689
1690 let v = unsafe {
1691 std::slice::from_raw_parts(
1692 &llcc as *const bpf_intf::llc_ctx as *const u8,
1693 std::mem::size_of::<bpf_intf::llc_ctx>(),
1694 )
1695 };
1696
1697 skel.maps
1698 .llc_data
1699 .update(llc_id_slice, v, libbpf_rs::MapFlags::ANY)?
1700 }
1701
1702 Ok(())
1703 }
1704
1705 fn init(
1706 opts: &'a Opts,
1707 layer_specs: &[LayerSpec],
1708 open_object: &'a mut MaybeUninit<OpenObject>,
1709 ) -> Result<Self> {
1710 let nr_layers = layer_specs.len();
1711 let mut disable_topology = opts.disable_topology.unwrap_or(false);
1712
1713 let topo = Arc::new(if disable_topology {
1714 Topology::with_flattened_llc_node()?
1715 } else {
1716 Topology::new()?
1717 });
1718
1719 if topo.nodes.keys().enumerate().any(|(i, &k)| i != k) {
1726 bail!("Holes in node IDs detected: {:?}", topo.nodes.keys());
1727 }
1728 if topo.all_llcs.keys().enumerate().any(|(i, &k)| i != k) {
1729 bail!("Holes in LLC IDs detected: {:?}", topo.all_llcs.keys());
1730 }
1731 if topo.all_cpus.keys().enumerate().any(|(i, &k)| i != k) {
1732 bail!("Holes in CPU IDs detected: {:?}", topo.all_cpus.keys());
1733 }
1734
1735 let netdevs = if opts.netdev_irq_balance {
1736 warn!(
1737 "Experimental netdev IRQ balancing enabled. Reset IRQ masks of network devices after use!!!"
1738 );
1739 read_netdevs()?
1740 } else {
1741 BTreeMap::new()
1742 };
1743
        // Auto-select only when the user did not pass --disable-topology explicitly,
        // matching the "not specified" wording of the log message below.
        if opts.disable_topology.is_none() {
            if topo.nodes.len() == 1 && topo.nodes[&0].llcs.len() == 1 {
                disable_topology = true;
            };
1748 info!(
1749 "Topology awareness not specified, selecting {} based on hardware",
1750 if disable_topology {
1751 "disabled"
1752 } else {
1753 "enabled"
1754 }
1755 );
1756 };
1757
1758 let cpu_pool = CpuPool::new(topo.clone())?;
1759
1760 let layer_specs: Vec<_> = if disable_topology {
1763 info!("Disabling topology awareness");
1764 layer_specs
1765 .iter()
1766 .cloned()
1767 .map(|mut s| {
1768 s.kind.common_mut().nodes.clear();
1769 s.kind.common_mut().llcs.clear();
1770 s
1771 })
1772 .collect()
1773 } else {
1774 layer_specs.to_vec()
1775 };
1776
1777 let mut skel_builder = BpfSkelBuilder::default();
1779 skel_builder.obj_builder.debug(opts.verbose > 1);
1780 init_libbpf_logging(None);
1781 let mut skel = scx_ops_open!(skel_builder, open_object, layered)?;
1782
1783 if opts.enable_gpu_support {
1786 if opts.gpu_kprobe_level >= 1 {
1789 compat::cond_kprobe_enable("nvidia_open", &skel.progs.kprobe_nvidia_open)?;
1790 }
1791 if opts.gpu_kprobe_level >= 2 {
1794 compat::cond_kprobe_enable("nvidia_mmap", &skel.progs.kprobe_nvidia_mmap)?;
1795 }
1796 if opts.gpu_kprobe_level >= 3 {
1797 compat::cond_kprobe_enable("nvidia_poll", &skel.progs.kprobe_nvidia_poll)?;
1798 }
1799 }
1800
1801 skel.maps.rodata_data.slice_ns = scx_enums.SCX_SLICE_DFL;
1802 skel.maps.rodata_data.max_exec_ns = 20 * scx_enums.SCX_SLICE_DFL;
1803
1804 skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;
1806
1807 if !opts.disable_queued_wakeup {
1808 match *compat::SCX_OPS_ALLOW_QUEUED_WAKEUP {
1809 0 => info!("Kernel does not support queued wakeup optimization"),
1810 v => skel.struct_ops.layered_mut().flags |= v,
1811 }
1812 }
1813
1814 skel.maps.rodata_data.percpu_kthread_preempt = !opts.disable_percpu_kthread_preempt;
1815 skel.maps.rodata_data.debug = opts.verbose as u32;
1816 skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
1817 skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
1818 opts.max_exec_us * 1000
1819 } else {
1820 opts.slice_us * 1000 * 20
1821 };
1822 skel.maps.rodata_data.nr_cpu_ids = *NR_CPU_IDS as u32;
1823 skel.maps.rodata_data.nr_possible_cpus = *NR_CPUS_POSSIBLE as u32;
1824 skel.maps.rodata_data.smt_enabled = topo.smt_enabled;
1825 skel.maps.rodata_data.has_little_cores = topo.has_little_cores();
1826 skel.maps.rodata_data.xnuma_preemption = opts.xnuma_preemption;
1827 skel.maps.rodata_data.antistall_sec = opts.antistall_sec;
1828 skel.maps.rodata_data.monitor_disable = opts.monitor_disable;
1829 skel.maps.rodata_data.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
1830 skel.maps.rodata_data.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
1831 skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
1832 skel.maps.rodata_data.enable_gpu_support = opts.enable_gpu_support;
1833
1834 for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
1835 skel.maps.rodata_data.__sibling_cpu[cpu] = *sib;
1836 }
1837 for cpu in topo.all_cpus.keys() {
1838 skel.maps.rodata_data.all_cpus[cpu / 8] |= 1 << (cpu % 8);
1839 }
1840
1841 skel.maps.rodata_data.nr_op_layers = layer_specs
1842 .iter()
1843 .filter(|spec| match &spec.kind {
1844 LayerKind::Open { .. } => spec.kind.common().preempt,
1845 _ => false,
1846 })
1847 .count() as u32;
1848 skel.maps.rodata_data.nr_on_layers = layer_specs
1849 .iter()
1850 .filter(|spec| match &spec.kind {
1851 LayerKind::Open { .. } => !spec.kind.common().preempt,
1852 _ => false,
1853 })
1854 .count() as u32;
1855 skel.maps.rodata_data.nr_gp_layers = layer_specs
1856 .iter()
1857 .filter(|spec| match &spec.kind {
1858 LayerKind::Grouped { .. } => spec.kind.common().preempt,
1859 _ => false,
1860 })
1861 .count() as u32;
1862 skel.maps.rodata_data.nr_gn_layers = layer_specs
1863 .iter()
1864 .filter(|spec| match &spec.kind {
1865 LayerKind::Grouped { .. } => !spec.kind.common().preempt,
1866 _ => false,
1867 })
1868 .count() as u32;
1869
1870 let mut min_open = u64::MAX;
1871 let mut min_preempt = u64::MAX;
1872
1873 for spec in layer_specs.iter() {
1874 if let LayerKind::Open { common, .. } = &spec.kind {
1875 min_open = min_open.min(common.disallow_open_after_us.unwrap());
1876 min_preempt = min_preempt.min(common.disallow_preempt_after_us.unwrap());
1877 }
1878 }
1879
1880 skel.maps.rodata_data.min_open_layer_disallow_open_after_ns = match min_open {
1881 u64::MAX => *DFL_DISALLOW_OPEN_AFTER_US,
1882 v => v,
1883 };
1884 skel.maps
1885 .rodata_data
1886 .min_open_layer_disallow_preempt_after_ns = match min_preempt {
1887 u64::MAX => *DFL_DISALLOW_PREEMPT_AFTER_US,
1888 v => v,
1889 };
1890
1891 for i in 0..layer_specs.len() {
1893 skel.maps.bss_data.empty_layer_ids[i] = i as u32;
1894 }
1895 skel.maps.bss_data.nr_empty_layer_ids = nr_layers as u32;
1896
1897 Self::init_layers(&mut skel, &layer_specs, &topo)?;
1898 Self::init_nodes(&mut skel, opts, &topo);
1899
1900 let mut skel = scx_ops_load!(skel, layered, uei)?;
1901
1902 let mut layers = vec![];
1903 let layer_growth_orders =
1904 LayerGrowthAlgo::layer_core_orders(&cpu_pool, &layer_specs, &topo)?;
1905 for (idx, spec) in layer_specs.iter().enumerate() {
1906 let growth_order = layer_growth_orders
1907 .get(&idx)
1908 .with_context(|| "layer has no growth order".to_string())?;
1909 layers.push(Layer::new(spec, &topo, growth_order)?);
1910 }
1911
1912 let mut idle_qos_enabled = layers
1913 .iter()
1914 .any(|layer| layer.kind.common().idle_resume_us.unwrap_or(0) > 0);
1915 if idle_qos_enabled && !cpu_idle_resume_latency_supported() {
1916 warn!("idle_resume_us not supported, ignoring");
1917 idle_qos_enabled = false;
1918 }
1919
1920 Self::init_cpus(&skel, &layer_specs, &topo)?;
1921 Self::init_llc_prox_map(&mut skel, &topo)?;
1922
1923 let proc_reader = procfs::ProcReader::new();
1925
1926 let input = ProgramInput {
1928 ..Default::default()
1929 };
1930 let prog = &mut skel.progs.initialize_pid_namespace;
1931
1932 let _ = prog.test_run(input);
1933
1934 let struct_ops = scx_ops_attach!(skel, layered)?;
1942 let stats_server = StatsServer::new(stats::server_data()).launch()?;
1943
1944 let sched = Self {
1945 struct_ops: Some(struct_ops),
1946 layer_specs,
1947
1948 sched_intv: Duration::from_secs_f64(opts.interval),
1949
1950 cpu_pool,
1951 layers,
1952 idle_qos_enabled,
1953
1954 sched_stats: Stats::new(&mut skel, &proc_reader)?,
1955
1956 nr_layer_cpus_ranges: vec![(0, 0); nr_layers],
1957 processing_dur: Default::default(),
1958
1959 proc_reader,
1960 skel,
1961
1962 topo,
1963 netdevs,
1964 stats_server,
1965 };
1966
1967 info!("Layered Scheduler Attached. Run `scx_layered --monitor` for metrics.");
1968
1969 Ok(sched)
1970 }
1971
1972 fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut types::layer) {
1973 trace!("[{}] Updating BPF CPUs: {}", layer.name, &layer.cpus);
1974 for cpu in 0..layer.cpus.len() {
1975 if layer.cpus.test_cpu(cpu) {
1976 bpf_layer.cpus[cpu / 8] |= 1 << (cpu % 8);
1977 } else {
1978 bpf_layer.cpus[cpu / 8] &= !(1 << (cpu % 8));
1979 }
1980 }
1981
1982 bpf_layer.nr_cpus = layer.nr_cpus as u32;
1983 for (llc_id, &nr_llc_cpus) in layer.nr_llc_cpus.iter().enumerate() {
1984 bpf_layer.nr_llc_cpus[llc_id] = nr_llc_cpus as u32;
1985 }
1986
1987 bpf_layer.refresh_cpus = 1;
1988 }
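
    // The byte/bit packing above mirrors how the BPF side reads the cpumask: e.g. for a
    // hypothetical CPU 10, 10 / 8 = byte 1 and 1 << (10 % 8) = bit 2, so setting CPU 10
    // ORs 0x04 into cpus[1]; clearing it ANDs that bit back out.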
1989
1990 fn update_netdev_cpumasks(&mut self) -> Result<()> {
1991 let available_cpus = self.cpu_pool.available_cpus();
1992 if available_cpus.is_empty() {
1993 return Ok(());
1994 }
1995
1996 for (iface, netdev) in self.netdevs.iter_mut() {
1997 let node = self
1998 .topo
1999 .nodes
2000 .values()
                // find() rather than take_while().next(): the matching node is not
                // necessarily the first entry in the map.
                .find(|n| n.id == netdev.node())
2003 .ok_or_else(|| anyhow!("Failed to get netdev node"))?;
2004 let node_cpus = node.span.clone();
2005 for (irq, irqmask) in netdev.irqs.iter_mut() {
2006 irqmask.clear_all();
2007 for cpu in available_cpus.iter() {
2008 if !node_cpus.test_cpu(cpu) {
2009 continue;
2010 }
2011 let _ = irqmask.set_cpu(cpu);
2012 }
2013 if irqmask.weight() == 0 {
2015 for cpu in node_cpus.iter() {
2016 let _ = irqmask.set_cpu(cpu);
2017 }
2018 }
2019 trace!("{} updating irq {} cpumask {:?}", iface, irq, irqmask);
2020 }
2021 netdev.apply_cpumasks()?;
2022 }
2023
2024 Ok(())
2025 }
2026
2027 fn calc_target_nr_cpus(&self) -> Vec<(usize, usize)> {
2033 let nr_cpus = self.cpu_pool.topo.all_cpus.len();
2034 let utils = &self.sched_stats.layer_utils;
2035
2036 let mut records: Vec<(u64, u64, u64, usize, usize, usize)> = vec![];
2037 let mut targets: Vec<(usize, usize)> = vec![];
2038
2039 for (idx, layer) in self.layers.iter().enumerate() {
2040 targets.push(match &layer.kind {
2041 LayerKind::Confined {
2042 util_range,
2043 cpus_range,
2044 cpus_range_frac,
2045 ..
2046 }
2047 | LayerKind::Grouped {
2048 util_range,
2049 cpus_range,
2050 cpus_range_frac,
2051 ..
2052 } => {
2053 let owned = utils[idx][LAYER_USAGE_OWNED];
2059 let open = utils[idx][LAYER_USAGE_OPEN];
2060
2061 let mut util = owned;
2062 if layer.nr_cpus == 0 {
2063 util += open;
2064 }
2065
2066 let util = if util < 0.01 { 0.0 } else { util };
2067 let low = (util / util_range.1).ceil() as usize;
2068 let high = ((util / util_range.0).floor() as usize).max(low);
2069 let target = layer.cpus.weight().clamp(low, high);
2070 let cpus_range =
2071 resolve_cpus_pct_range(cpus_range, cpus_range_frac, nr_cpus).unwrap();
2072
2073 records.push((
2074 (owned * 100.0) as u64,
2075 (open * 100.0) as u64,
2076 (util * 100.0) as u64,
2077 low,
2078 high,
2079 target,
2080 ));
2081
2082 (target.clamp(cpus_range.0, cpus_range.1), cpus_range.0)
2083 }
2084 LayerKind::Open { .. } => (0, 0),
2085 });
2086 }
2087
2088 trace!("initial targets: {:?}", &targets);
2089 trace!("(owned, open, util, low, high, target): {:?}", &records);
2090 targets
2091 }
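
    // Worked example for the low/high band in calc_target_nr_cpus() above, with made-up
    // numbers: a confined layer averaging util = 3.0 CPUs with util_range = (0.5, 0.6)
    // gets low = ceil(3.0 / 0.6) = 5 and high = floor(3.0 / 0.5) = 6, so its current CPU
    // count is clamped into [5, 6] before the per-layer cpus_range is applied.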
2092
2093 fn weighted_target_nr_cpus(&self, targets: &[(usize, usize)]) -> Vec<usize> {
2097 let mut nr_left = self.cpu_pool.topo.all_cpus.len();
2098 let weights: Vec<usize> = self
2099 .layers
2100 .iter()
2101 .map(|layer| layer.kind.common().weight as usize)
2102 .collect();
2103 let mut cands: BTreeMap<usize, (usize, usize, usize)> = targets
2104 .iter()
2105 .zip(&weights)
2106 .enumerate()
2107 .map(|(i, ((target, min), weight))| (i, (*target, *min, *weight)))
2108 .collect();
2109 let mut weight_sum: usize = weights.iter().sum();
2110 let mut weighted: Vec<usize> = vec![0; self.layers.len()];
2111
2112 trace!("cands: {:?}", &cands);
2113
2114 cands.retain(|&i, &mut (target, min, weight)| {
2116 if target <= min {
2117 let target = target.min(nr_left);
2118 weighted[i] = target;
2119 weight_sum -= weight;
2120 nr_left -= target;
2121 false
2122 } else {
2123 true
2124 }
2125 });
2126
2127 trace!("cands after accepting mins: {:?}", &cands);
2128
2129 let calc_share = |nr_left, weight, weight_sum| {
2131 (((nr_left * weight) as f64 / weight_sum as f64).ceil() as usize).min(nr_left)
2132 };
2133
2134 while !cands.is_empty() {
2135 let mut progress = false;
2136
2137 cands.retain(|&i, &mut (target, _min, weight)| {
2138 let share = calc_share(nr_left, weight, weight_sum);
2139 if target <= share {
2140 weighted[i] = target;
2141 weight_sum -= weight;
2142 nr_left -= target;
2143 progress = true;
2144 false
2145 } else {
2146 true
2147 }
2148 });
2149
2150 if !progress {
2151 break;
2152 }
2153 }
2154
2155 trace!("cands after accepting under allotted: {:?}", &cands);
2156
2157 let nr_to_share = nr_left;
2160 for (i, (_target, _min, weight)) in cands.into_iter() {
2161 let share = calc_share(nr_to_share, weight, weight_sum).min(nr_left);
2162 weighted[i] = share;
2163 nr_left -= share;
2164 }
2165
2166 trace!("weighted: {:?}", &weighted);
2167
2168 weighted
2169 }
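
    // A worked example for weighted_target_nr_cpus() with hypothetical numbers: on a
    // 16-CPU machine, targets [(12, 2), (12, 2)] with weights [100, 300] give initial
    // shares ceil(16 * 100 / 400) = 4 and ceil(16 * 300 / 400) = 12. Layer 1 fits its
    // share and is granted its full target of 12; layer 0 does not, and ends up with
    // the remaining 4 CPUs, so weighted = [4, 12].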
2170
2171 fn compute_target_llcs(target: usize, topo: &Topology) -> (usize, usize) {
2175 let cores_per_llc = topo.all_cores.len() / topo.all_llcs.len();
2177 let cpus_per_core = topo.all_cores.first_key_value().unwrap().1.cpus.len();
2179 let cpus_per_llc = cores_per_llc * cpus_per_core;
2180
2181 let full = target / cpus_per_llc;
2182 let extra = target % cpus_per_llc;
2183
2184 (full, extra.div_ceil(cpus_per_core))
2185 }
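
    // Worked example for compute_target_llcs() (hypothetical topology): with 8 cores per
    // LLC and 2 CPUs per core (16 CPUs per LLC), a target of 21 CPUs yields
    // full = 21 / 16 = 1 fully-owned LLC and extra = ceil((21 % 16) / 2) = 3 extra cores
    // to place beyond it.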
2186
2187 fn recompute_layer_core_order(&mut self, layer_targets: &Vec<(usize, usize)>) {
2195 debug!(
2197 " free: before pass: free_llcs={:?}",
2198 self.cpu_pool.free_llcs
2199 );
2200 for &(idx, target) in layer_targets.iter().rev() {
2201 let layer = &mut self.layers[idx];
2202 let old_tlc = layer.target_llc_cpus;
2203 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2204
2205 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2206 continue;
2207 }
2208
2209 let mut to_free = (old_tlc.0 as i32 - new_tlc.0 as i32).max(0) as usize;
2210
2211 debug!(
2212 " free: layer={} old_tlc={:?} new_tlc={:?} to_free={} assigned={} free={}",
2213 layer.name,
2214 old_tlc,
2215 new_tlc,
2216 to_free,
2217 layer.assigned_llcs.len(),
2218 self.cpu_pool.free_llcs.len()
2219 );
2220
2221 while to_free > 0 && layer.assigned_llcs.len() > 0 {
2222 let llc = layer.assigned_llcs.pop().unwrap();
2223 self.cpu_pool.free_llcs.push((llc, 0));
2224 to_free -= 1;
2225
2226 debug!(" layer={} freed_llc={}", layer.name, llc);
2227 }
2228 }
2229 debug!(" free: after pass: free_llcs={:?}", self.cpu_pool.free_llcs);
2230
2231 for &(idx, target) in layer_targets.iter().rev() {
2233 let layer = &mut self.layers[idx];
2234 let old_tlc = layer.target_llc_cpus;
2235 let new_tlc = Self::compute_target_llcs(target, &self.topo);
2236
2237 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2238 continue;
2239 }
2240
2241 let mut to_alloc = (new_tlc.0 as i32 - old_tlc.0 as i32).max(0) as usize;
2242
2243 debug!(
2244 " alloc: layer={} old_tlc={:?} new_tlc={:?} to_alloc={} assigned={} free={}",
2245 layer.name,
2246 old_tlc,
2247 new_tlc,
2248 to_alloc,
2249 layer.assigned_llcs.len(),
2250 self.cpu_pool.free_llcs.len()
2251 );
2252
2253 while to_alloc > 0
2254 && self.cpu_pool.free_llcs.len() > 0
2255 && to_alloc <= self.cpu_pool.free_llcs.len()
2256 {
2257 let llc = self.cpu_pool.free_llcs.pop().unwrap().0;
2258 layer.assigned_llcs.push(llc);
2259 to_alloc -= 1;
2260
2261 debug!(" layer={} alloc_llc={}", layer.name, llc);
2262 }
2263
2264 debug!(
2265 " alloc: layer={} assigned_llcs={:?}",
2266 layer.name, layer.assigned_llcs
2267 );
2268
2269 layer.target_llc_cpus = new_tlc;
2271 }
2272
2273 for &(idx, _) in layer_targets.iter() {
2276 let mut core_order = vec![];
2277 let layer = &mut self.layers[idx];
2278
2279 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2280 continue;
2281 }
2282
2283 let tlc = layer.target_llc_cpus;
2284 let mut extra = tlc.1;
2285 let cores_per_llc = self.topo.all_cores.len() / self.topo.all_llcs.len();
2287 let cpus_per_core = self.topo.all_cores.first_key_value().unwrap().1.cpus.len();
2288 let cpus_per_llc = cores_per_llc * cpus_per_core;
2289
2290 for i in 0..self.cpu_pool.free_llcs.len() {
2292 let free_vec = &mut self.cpu_pool.free_llcs;
2293 let avail = cpus_per_llc - free_vec[i].1;
2295 let mut used = extra.min(avail);
2297
2298 let shift = free_vec[i].1;
2299 free_vec[i].1 += used;
2300
2301 let llc_id = free_vec[i].0;
2302 let llc = self.topo.all_llcs.get(&llc_id).unwrap();
2303
2304 for core in llc.cores.iter().skip(shift) {
2305 core_order.push(core.1.id);
2306 if used == 0 {
2307 break;
2308 }
2309 used -= 1;
2310 }
2311
2312 extra -= used;
2313 if extra == 0 {
2314 break;
2315 }
2316 }
2317
2318 core_order.reverse();
2319 layer.core_order = core_order;
2320 }
2321
2322 for i in 0..self.cpu_pool.free_llcs.len() {
2324 self.cpu_pool.free_llcs[i].1 = 0;
2325 }
2326
2327 for &(idx, _) in layer_targets.iter() {
2328 let layer = &mut self.layers[idx];
2329
2330 if layer.growth_algo != LayerGrowthAlgo::StickyDynamic {
2331 continue;
2332 }
2333
2334 for core in self.topo.all_cores.iter() {
2335 let llc_id = core.1.llc_id;
2336 if layer.assigned_llcs.contains(&llc_id) {
2337 layer.core_order.push(core.1.id);
2338 }
2339 }
2340 layer.core_order.reverse();
2342
2343 debug!(
2344 " alloc: layer={} core_order={:?}",
2345 layer.name, layer.core_order
2346 );
2347 }
2348 }
2349
    fn refresh_cpumasks(&mut self) -> Result<()> {
        let layer_is_open = |layer: &Layer| matches!(layer.kind, LayerKind::Open { .. });

        let mut updated = false;
        let targets = self.calc_target_nr_cpus();
        let targets = self.weighted_target_nr_cpus(&targets);

        let mut ascending: Vec<(usize, usize)> = targets.iter().copied().enumerate().collect();
        ascending.sort_by(|a, b| a.1.cmp(&b.1));

        self.recompute_layer_core_order(&ascending);

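        // If any layer is currently below its target, allow the first
        // over-target layer visited below to free up to all of its CPUs at
        // once so the under-provisioned layers can be satisfied.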
        let mut force_free = self
            .layers
            .iter()
            .zip(targets.iter())
            .any(|(layer, &target)| layer.nr_cpus < target);

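        // Shrink pass: visit layers from the largest target down and free CPUs
        // from confined/grouped layers that are above target. Freeing stops
        // early once at least half of the excess has been returned or no more
        // CPUs can be freed.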
        for &(idx, target) in ascending.iter().rev() {
            let layer = &mut self.layers[idx];
            if layer_is_open(layer) {
                continue;
            }

            let nr_cur = layer.cpus.weight();
            if nr_cur <= target {
                continue;
            }
            let mut nr_to_free = nr_cur - target;

            let nr_to_break_at = nr_to_free / 2;

            let mut freed = false;

            while nr_to_free > 0 {
                let max_to_free = if force_free {
                    force_free = false;
                    layer.nr_cpus
                } else {
                    nr_to_free
                };

                let nr_freed = layer.free_some_cpus(&mut self.cpu_pool, max_to_free)?;
                if nr_freed == 0 {
                    break;
                }

                nr_to_free = nr_to_free.saturating_sub(nr_freed);
                freed = true;

                if nr_to_free <= nr_to_break_at {
                    break;
                }
            }

            if freed {
                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
                updated = true;
            }
        }

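        // Grow pass: visit layers from the smallest target up and allocate
        // CPUs to closed layers below target, stopping once the target is met
        // or no more CPUs can be allocated.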
        for &(idx, target) in &ascending {
            let layer = &mut self.layers[idx];

            if layer_is_open(layer) {
                continue;
            }

            let nr_cur = layer.cpus.weight();
            if nr_cur >= target {
                continue;
            }

            let mut nr_to_alloc = target - nr_cur;
            let mut alloced = false;

            while nr_to_alloc > 0 {
                let nr_alloced = layer.alloc_some_cpus(&mut self.cpu_pool)?;
                if nr_alloced == 0 {
                    break;
                }
                alloced = true;
                nr_to_alloc -= nr_alloced.min(nr_to_alloc);
            }

            if alloced {
                Self::update_bpf_layer_cpumask(layer, &mut self.skel.maps.bss_data.layers[idx]);
                updated = true;
            }
        }

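        // If anything changed, open layers get whatever CPUs are still
        // available, the fallback CPU and empty-layer list are refreshed, and
        // the BPF refresh_layer_cpumasks program is kicked to apply the new
        // masks.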
        if updated {
            for (idx, layer) in self.layers.iter_mut().enumerate() {
                if !layer_is_open(layer) {
                    continue;
                }

                let bpf_layer = &mut self.skel.maps.bss_data.layers[idx];
                let available_cpus = self.cpu_pool.available_cpus().and(&layer.allowed_cpus);
                let nr_available_cpus = available_cpus.weight();

                layer.cpus = available_cpus;
                layer.nr_cpus = nr_available_cpus;
                Self::update_bpf_layer_cpumask(layer, bpf_layer);
            }

            self.skel.maps.bss_data.fallback_cpu = self.cpu_pool.fallback_cpu as u32;

            for (lidx, layer) in self.layers.iter().enumerate() {
                self.nr_layer_cpus_ranges[lidx] = (
                    self.nr_layer_cpus_ranges[lidx].0.min(layer.nr_cpus),
                    self.nr_layer_cpus_ranges[lidx].1.max(layer.nr_cpus),
                );
            }

            let input = ProgramInput {
                ..Default::default()
            };
            let prog = &mut self.skel.progs.refresh_layer_cpumasks;
            let _ = prog.test_run(input);

            let empty_layer_ids: Vec<u32> = self
                .layers
                .iter()
                .enumerate()
                .filter(|(_idx, layer)| layer.nr_cpus == 0)
                .map(|(idx, _layer)| idx as u32)
                .collect();
            for i in 0..self.layers.len() {
                self.skel.maps.bss_data.empty_layer_ids[i] =
                    empty_layer_ids.get(i).cloned().unwrap_or(MAX_LAYERS as u32);
            }
            self.skel.maps.bss_data.nr_empty_layer_ids = empty_layer_ids.len() as u32;
        }

        let _ = self.update_netdev_cpumasks();
        Ok(())
    }

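    // Propagate each layer's idle_resume_us setting to its CPUs via the
    // per-CPU idle resume latency QoS knob. CPUs not covered by any layer are
    // set back to 0.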
    fn refresh_idle_qos(&mut self) -> Result<()> {
        if !self.idle_qos_enabled {
            return Ok(());
        }

        let mut cpu_idle_qos = vec![0; *NR_CPU_IDS];
        for layer in self.layers.iter() {
            let idle_resume_us = layer.kind.common().idle_resume_us.unwrap_or(0) as i32;
            for cpu in layer.cpus.iter() {
                cpu_idle_qos[cpu] = idle_resume_us;
            }
        }

        for (cpu, idle_resume_usec) in cpu_idle_qos.iter().enumerate() {
            update_cpu_idle_resume_latency(cpu, *idle_resume_usec)?;
        }

        Ok(())
    }

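    // One scheduling interval: refresh stats from BPF and procfs, rebalance
    // the per-layer cpumasks, and update idle QoS. processing_dur accumulates
    // the time spent on this bookkeeping.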
    fn step(&mut self) -> Result<()> {
        let started_at = Instant::now();
        self.sched_stats.refresh(
            &mut self.skel,
            &self.proc_reader,
            started_at,
            self.processing_dur,
        )?;
        self.refresh_cpumasks()?;
        self.refresh_idle_qos()?;
        self.processing_dur += Instant::now().duration_since(started_at);
        Ok(())
    }

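    // Build a SysStats snapshot for the stats server and reset the caller's
    // per-layer CPU range trackers to the current counts.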
    fn generate_sys_stats(
        &mut self,
        stats: &Stats,
        cpus_ranges: &mut [(usize, usize)],
    ) -> Result<SysStats> {
        let bstats = &stats.bpf_stats;
        let mut sys_stats = SysStats::new(stats, bstats, self.cpu_pool.fallback_cpu)?;

        for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() {
            let layer_stats = LayerStats::new(lidx, layer, stats, bstats, cpus_ranges[lidx]);
            sys_stats.layers.insert(spec.name.to_string(), layer_stats);
            cpus_ranges[lidx] = (layer.nr_cpus, layer.nr_cpus);
        }

        Ok(sys_stats)
    }

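    // Main loop: step the scheduler every sched_intv and service stats
    // requests in between. Each stats client, keyed by its thread id, gets its
    // own min/max tracking of per-layer CPU counts between refreshes.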
    fn run(&mut self, shutdown: Arc<AtomicBool>) -> Result<UserExitInfo> {
        let (res_ch, req_ch) = self.stats_server.channels();
        let mut next_sched_at = Instant::now() + self.sched_intv;
        let mut cpus_ranges = HashMap::<ThreadId, Vec<(usize, usize)>>::new();

        while !shutdown.load(Ordering::Relaxed) && !uei_exited!(&self.skel, uei) {
            let now = Instant::now();

            if now >= next_sched_at {
                self.step()?;
                while next_sched_at < now {
                    next_sched_at += self.sched_intv;
                }
            }

            match req_ch.recv_deadline(next_sched_at) {
                Ok(StatsReq::Hello(tid)) => {
                    cpus_ranges.insert(
                        tid,
                        self.layers.iter().map(|l| (l.nr_cpus, l.nr_cpus)).collect(),
                    );
                    let stats = Stats::new(&mut self.skel, &self.proc_reader)?;
                    res_ch.send(StatsRes::Hello(stats))?;
                }
                Ok(StatsReq::Refresh(tid, mut stats)) => {
                    for i in 0..self.nr_layer_cpus_ranges.len() {
                        for (_, ranges) in cpus_ranges.iter_mut() {
                            ranges[i] = (
                                ranges[i].0.min(self.nr_layer_cpus_ranges[i].0),
                                ranges[i].1.max(self.nr_layer_cpus_ranges[i].1),
                            );
                        }
                        self.nr_layer_cpus_ranges[i] =
                            (self.layers[i].nr_cpus, self.layers[i].nr_cpus);
                    }

                    stats.refresh(&mut self.skel, &self.proc_reader, now, self.processing_dur)?;
                    let sys_stats =
                        self.generate_sys_stats(&stats, cpus_ranges.get_mut(&tid).unwrap())?;
                    res_ch.send(StatsRes::Refreshed((stats, sys_stats)))?;
                }
                Ok(StatsReq::Bye(tid)) => {
                    cpus_ranges.remove(&tid);
                    res_ch.send(StatsRes::Bye)?;
                }
                Err(RecvTimeoutError::Timeout) => {}
                Err(e) => Err(e)?,
            }
        }

        self.struct_ops.take();
        uei_report!(&self.skel, uei)
    }
}

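// Dropping the struct_ops link detaches the scheduler; do it explicitly here
// in case run() exited early without taking it.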
impl Drop for Scheduler<'_> {
    fn drop(&mut self) {
        if let Some(struct_ops) = self.struct_ops.take() {
            drop(struct_ops);
        }
    }
}

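// Write the built-in example configuration as pretty-printed JSON. Fails if
// the file already exists (create_new).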
fn write_example_file(path: &str) -> Result<()> {
    let mut f = fs::OpenOptions::new()
        .create_new(true)
        .write(true)
        .open(path)?;
    Ok(f.write_all(serde_json::to_string_pretty(&*EXAMPLE_CONFIG)?.as_bytes())?)
}

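// Sanity-check the parsed layer specs against the BPF-side limits: spec count,
// match block counts, prefix string lengths, and cpus/util ranges. The last
// spec must be a catch-all with a single empty match block.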
fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> {
    let nr_specs = specs.len();
    if nr_specs == 0 {
        bail!("No layer spec");
    }
    if nr_specs > MAX_LAYERS {
        bail!("Too many layer specs");
    }

    for (idx, spec) in specs.iter().enumerate() {
        if idx < nr_specs - 1 {
            if spec.matches.is_empty() {
                bail!("Non-terminal spec {:?} has no matches", spec.name);
            }
        } else {
            if spec.matches.len() != 1 || !spec.matches[0].is_empty() {
                bail!("Terminal spec {:?} must have an empty match", spec.name);
            }
        }

        if spec.matches.len() > MAX_LAYER_MATCH_ORS {
            bail!(
                "Spec {:?} has too many ({}) OR match blocks",
                spec.name,
                spec.matches.len()
            );
        }

        for (ands_idx, ands) in spec.matches.iter().enumerate() {
            if ands.len() > NR_LAYER_MATCH_KINDS {
                bail!(
                    "Spec {:?}'s {}th OR block has too many ({}) match conditions",
                    spec.name,
                    ands_idx,
                    ands.len()
                );
            }
            for one in ands.iter() {
                match one {
                    LayerMatch::CgroupPrefix(prefix) => {
                        if prefix.len() > MAX_PATH {
                            bail!("Spec {:?} has too long a cgroup prefix", spec.name);
                        }
                    }
                    LayerMatch::CommPrefix(prefix) => {
                        if prefix.len() > MAX_COMM {
                            bail!("Spec {:?} has too long a comm prefix", spec.name);
                        }
                    }
                    LayerMatch::PcommPrefix(prefix) => {
                        if prefix.len() > MAX_COMM {
                            bail!("Spec {:?} has too long a process name prefix", spec.name);
                        }
                    }
                    _ => {}
                }
            }
        }

        match spec.kind {
            LayerKind::Confined {
                cpus_range,
                util_range,
                ..
            }
            | LayerKind::Grouped {
                cpus_range,
                util_range,
                ..
            } => {
                if let Some((cpus_min, cpus_max)) = cpus_range {
                    if cpus_min > cpus_max {
                        bail!(
                            "Spec {:?} has invalid cpus_range ({}, {})",
                            spec.name,
                            cpus_min,
                            cpus_max
                        );
                    }
                }
                if util_range.0 >= util_range.1 {
                    bail!(
                        "Spec {:?} has invalid util_range ({}, {})",
                        spec.name,
                        util_range.0,
                        util_range.1
                    );
                }
            }
            _ => {}
        }
    }

    Ok(())
}

fn main() -> Result<()> {
    let opts = Opts::parse();

    if opts.help_stats {
        stats::server_data().describe_meta(&mut std::io::stdout(), None)?;
        return Ok(());
    }

    if opts.no_load_frac_limit {
        warn!("--no-load-frac-limit is deprecated and a no-op");
    }
    if opts.layer_preempt_weight_disable != 0.0 {
        warn!("--layer-preempt-weight-disable is deprecated and a no-op");
    }
    if opts.layer_growth_weight_disable != 0.0 {
        warn!("--layer-growth-weight-disable is deprecated and a no-op");
    }
    if opts.local_llc_iteration {
        warn!("--local-llc-iteration is deprecated and a no-op");
    }

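    // Logging: -v enables Debug, -vv and beyond Trace. Timestamps are limited
    // to error records and location/target/thread tags are disabled to keep
    // terminal output compact.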
    let llv = match opts.verbose {
        0 => simplelog::LevelFilter::Info,
        1 => simplelog::LevelFilter::Debug,
        _ => simplelog::LevelFilter::Trace,
    };
    let mut lcfg = simplelog::ConfigBuilder::new();
    lcfg.set_time_level(simplelog::LevelFilter::Error)
        .set_location_level(simplelog::LevelFilter::Off)
        .set_target_level(simplelog::LevelFilter::Off)
        .set_thread_level(simplelog::LevelFilter::Off);
    simplelog::TermLogger::init(
        llv,
        lcfg.build(),
        simplelog::TerminalMode::Stderr,
        simplelog::ColorChoice::Auto,
    )?;

    debug!("opts={:?}", &opts);

    let shutdown = Arc::new(AtomicBool::new(false));
    let shutdown_clone = shutdown.clone();
    ctrlc::set_handler(move || {
        shutdown_clone.store(true, Ordering::Relaxed);
    })
    .context("Error setting Ctrl-C handler")?;

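    // --monitor and --stats both spawn the stats monitoring thread. If
    // --monitor was given, run in monitor-only mode: wait for the thread and
    // exit without loading the scheduler.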
    if let Some(intv) = opts.monitor.or(opts.stats) {
        let shutdown_copy = shutdown.clone();
        let jh = std::thread::spawn(move || {
            match stats::monitor(Duration::from_secs_f64(intv), shutdown_copy) {
                Ok(_) => {
                    debug!("stats monitor thread finished successfully")
                }
                Err(error_object) => {
                    warn!(
                        "stats monitor thread finished because of an error: {}",
                        error_object
                    )
                }
            }
        });
        if opts.monitor.is_some() {
            let _ = jh.join();
            return Ok(());
        }
    }

    if let Some(path) = &opts.example {
        write_example_file(path)?;
        return Ok(());
    }

    let mut layer_config = match opts.run_example {
        true => EXAMPLE_CONFIG.clone(),
        false => LayerConfig { specs: vec![] },
    };

    for (idx, input) in opts.specs.iter().enumerate() {
        layer_config.specs.append(
            &mut LayerSpec::parse(input)
                .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?,
        );
    }

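    // Fill in per-layer defaults: slice length, weight, and the
    // disallow_open/preempt_after_us knobs. Preempt layers get both knobs
    // pinned to u64::MAX, and any explicit values are ignored with a warning.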
    for spec in layer_config.specs.iter_mut() {
        let common = spec.kind.common_mut();

        if common.slice_us == 0 {
            common.slice_us = opts.slice_us;
        }

        if common.weight == 0 {
            common.weight = DEFAULT_LAYER_WEIGHT;
        }
        common.weight = common.weight.clamp(MIN_LAYER_WEIGHT, MAX_LAYER_WEIGHT);

        if common.preempt {
            if common.disallow_open_after_us.is_some() {
                warn!(
                    "Preempt layer {} has non-null disallow_open_after_us, ignored",
                    &spec.name
                );
            }
            if common.disallow_preempt_after_us.is_some() {
                warn!(
                    "Preempt layer {} has non-null disallow_preempt_after_us, ignored",
                    &spec.name
                );
            }
            common.disallow_open_after_us = Some(u64::MAX);
            common.disallow_preempt_after_us = Some(u64::MAX);
        } else {
            if common.disallow_open_after_us.is_none() {
                common.disallow_open_after_us = Some(*DFL_DISALLOW_OPEN_AFTER_US);
            }

            if common.disallow_preempt_after_us.is_none() {
                common.disallow_preempt_after_us = Some(*DFL_DISALLOW_PREEMPT_AFTER_US);
            }
        }

        if common.idle_smt.is_some() {
            warn!("Layer {} has deprecated flag \"idle_smt\"", &spec.name);
        }
    }

    debug!("specs={}", serde_json::to_string_pretty(&layer_config)?);
    verify_layer_specs(&layer_config.specs)?;

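    // Re-initialize and re-run the scheduler whenever it exits with a restart
    // request; break out of the loop otherwise.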
    let mut open_object = MaybeUninit::uninit();
    loop {
        let mut sched = Scheduler::init(&opts, &layer_config.specs, &mut open_object)?;
        if !sched.run(shutdown.clone())?.should_restart() {
            break;
        }
    }

    Ok(())
}