1use std::collections::BTreeMap;
2use std::io::Write;
3use std::sync::atomic::AtomicBool;
4use std::sync::atomic::Ordering;
5use std::sync::Arc;
6use std::thread::current;
7use std::thread::ThreadId;
8use std::time::Duration;
9use std::time::SystemTime;
10use std::time::UNIX_EPOCH;
11
12use anyhow::bail;
13use anyhow::Result;
14use chrono::DateTime;
15use chrono::Local;
16use scx_stats::prelude::*;
17use scx_stats_derive::stat_doc;
18use scx_stats_derive::Stats;
19use scx_utils::Cpumask;
20use scx_utils::Topology;
21use serde::Deserialize;
22use serde::Serialize;
23use tracing::warn;
24
25use crate::bpf_intf;
26use crate::BpfStats;
27use crate::Layer;
28use crate::LayerKind;
29use crate::Stats;
30use crate::LAYER_USAGE_OPEN;
31use crate::LAYER_USAGE_PROTECTED;
32use crate::LAYER_USAGE_PROTECTED_PREEMPT;
33use crate::LAYER_USAGE_SUM_UPTO;
34
// Indices of the global statistics, taken from the `global_stat_id_*` values
// in `bpf_intf` and cast to `usize` so they can index `gstats` directly.
const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;

// Indices of the per-layer statistics, taken from the `layer_stat_id_*` values
// in `bpf_intf`; used below to index `lstats[layer]` and `lstats_sums`.
const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;

// Indices of the per-LLC, per-layer statistics, taken from the
// `llc_layer_stat_id_*` values in `bpf_intf`; used to index `llc_lstats`.
const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;
83
/// Returns `a` as a percentage of `b`.
///
/// A zero denominator yields `0.0` rather than a non-finite value.
fn calc_frac(a: f64, b: f64) -> f64 {
    if b == 0.0 {
        return 0.0;
    }
    a / b * 100.0
}
91
/// Renders a percentage in a field at least four characters wide.
///
/// The number of decimals shrinks as the value grows: none from 99.95 up, one
/// from 10 up, two otherwise. A positive value smaller than 0.01 is shown as
/// `0.01` so that it stays distinguishable from an exact zero.
fn fmt_pct(v: f64) -> String {
    // Pick the value to display and its precision first, then format once.
    let (shown, digits): (f64, usize) = match v {
        x if x >= 99.95 => (x, 0),
        x if x >= 10.0 => (x, 1),
        x if x > 0.0 && x < 0.01 => (0.01, 2),
        x => (x, 2),
    };
    format!("{:4.prec$}", shown, prec = digits)
}
103
/// Formats a duration given in milliseconds with a unit that fits its size.
///
/// * one minute and above: minutes, with one decimal below 100 minutes and
///   none from 100 minutes up;
/// * one second and above: seconds with one decimal;
/// * 10 ms and above: whole milliseconds;
/// * anything smaller: milliseconds with one decimal.
fn fmt_duration_ms(ms: f64) -> String {
    if ms >= 60_000.0 {
        let min = ms / 60_000.0;
        if min >= 100.0 {
            format!("{:.0}min", min)
        } else {
            format!("{:.1}min", min)
        }
    } else if ms >= 1_000.0 {
        // Values here are always below 60 s, so the former "100 s and up"
        // special case could never trigger and has been dropped.
        format!("{:.1}s", ms / 1_000.0)
    } else if ms >= 10.0 {
        format!("{:.0}ms", ms)
    } else {
        format!("{:.1}ms", ms)
    }
}
125
/// Renders a count compactly, scaling to thousands (`k`) or millions (`m`).
///
/// Values up to and including 1000 are printed unscaled, followed by a space
/// in place of the unit suffix; values up to and including one million are
/// printed in thousands; anything larger in millions.
fn fmt_num(v: u64) -> String {
    match v {
        0..=1_000 => format!("{:5.0} ", v),
        1_001..=1_000_000 => format!("{:5.1}k", v as f64 / 1_000.0),
        _ => format!("{:5.1}m", v as f64 / 1_000_000.0),
    }
}
135
136#[stat_doc]
137#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
138#[stat(_om_prefix = "l_", _om_label = "layer_name")]
139pub struct LayerStats {
140 #[stat(desc = "index", _om_skip)]
141 pub index: usize,
142 #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
143 pub util: f64,
144 #[stat(desc = "Compensated CPU utilization (adjusted for irq/softirq/stolen)")]
145 pub util_compensated: f64,
146 #[stat(desc = "Protected CPU utilization %")]
147 pub util_protected_frac: f64,
148 #[stat(desc = "Preempt-protected CPU utilization %")]
149 pub util_protected_preempt_frac: f64,
150 #[stat(desc = "Open CPU utilization %")]
151 pub util_open_frac: f64,
152 #[stat(desc = "fraction of total CPU utilization")]
153 pub util_frac: f64,
154 #[stat(desc = "number of tasks")]
155 pub tasks: u32,
156 #[stat(desc = "count of sched events during the period")]
157 pub total: u64,
158 #[stat(desc = "% dispatched into idle CPU from select_cpu")]
159 pub sel_local: f64,
160 #[stat(desc = "% dispatched into idle CPU from enqueue")]
161 pub enq_local: f64,
162 #[stat(desc = "% enqueued after wakeup")]
163 pub enq_wakeup: f64,
164 #[stat(desc = "% enqueued after slice expiration")]
165 pub enq_expire: f64,
166 #[stat(desc = "% re-enqueued due to RT preemption")]
167 pub enq_reenq: f64,
168 #[stat(desc = "% enqueued into the layer's LLC DSQs")]
169 pub enq_dsq: f64,
170 #[stat(desc = "count of times exec duration < min_exec_us")]
171 pub min_exec: f64,
172 #[stat(desc = "total exec durations extended due to min_exec_us")]
173 pub min_exec_us: u64,
174 #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
175 pub open_idle: f64,
176 #[stat(desc = "% preempted other tasks")]
177 pub preempt: f64,
178 #[stat(desc = "% preempted XLLC tasks")]
179 pub preempt_xllc: f64,
180 #[stat(desc = "% preempted across NUMA nodes")]
181 pub preempt_xnuma: f64,
182 #[stat(desc = "% first-preempted other tasks")]
183 pub preempt_first: f64,
184 #[stat(desc = "% idle-preempted other tasks")]
185 pub preempt_idle: f64,
186 #[stat(desc = "% attempted to preempt other tasks but failed")]
187 pub preempt_fail: f64,
188 #[stat(desc = "% violated config due to CPU affinity")]
189 pub affn_viol: f64,
190 #[stat(desc = "% continued executing after slice expiration")]
191 pub keep: f64,
192 #[stat(desc = "% disallowed to continue executing due to max_exec")]
193 pub keep_fail_max_exec: f64,
194 #[stat(desc = "% disallowed to continue executing due to other tasks")]
195 pub keep_fail_busy: f64,
196 #[stat(desc = "whether is exclusive", _om_skip)]
197 pub is_excl: u32,
198 #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
199 pub excl_collision: f64,
200 #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
201 pub excl_preempt: f64,
202 #[stat(desc = "% yielded")]
203 pub yielded: f64,
204 #[stat(desc = "count of times yield was ignored")]
205 pub yield_ignore: u64,
206 #[stat(desc = "% migrated across CPUs")]
207 pub migration: f64,
208 #[stat(desc = "% migrated across NUMA nodes")]
209 pub xnuma_migration: f64,
210 #[stat(desc = "% migrated across LLCs")]
211 pub xllc_migration: f64,
212 #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
213 pub xllc_migration_skip: f64,
214 #[stat(desc = "% wakers across layers")]
215 pub xlayer_wake: f64,
216 #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
217 pub xlayer_rewake: f64,
218 #[stat(desc = "% LLC draining tried")]
219 pub llc_drain_try: f64,
220 #[stat(desc = "% LLC draining succeeded")]
221 pub llc_drain: f64,
222 #[stat(desc = "% skip LLC dispatch on remote node")]
223 pub skip_remote_node: f64,
224 #[stat(desc = "mask of allocated CPUs", _om_skip)]
225 pub cpus: Vec<u64>,
226 #[stat(desc = "count of CPUs assigned")]
227 pub cur_nr_cpus: u32,
228 #[stat(desc = "minimum # of CPUs assigned")]
229 pub min_nr_cpus: u32,
230 #[stat(desc = "maximum # of CPUs assigned")]
231 pub max_nr_cpus: u32,
232 #[stat(desc = "count of CPUs assigned per LLC")]
233 pub nr_llc_cpus: Vec<u32>,
234 #[stat(desc = "slice duration config")]
235 pub slice_us: u64,
236 #[stat(desc = "Per-LLC scheduling event fractions")]
237 pub llc_fracs: Vec<f64>,
238 #[stat(desc = "Per-LLC average latency")]
239 pub llc_lats: Vec<f64>,
240 #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\"")]
241 pub membw_pct: f64,
242 #[stat(desc = "DSQ insertion ratio EWMA (10s window)")]
243 pub dsq_insert_ewma: f64,
244 #[stat(desc = "Per-node layer utilization (100% = one full CPU)")]
245 pub node_utils: Vec<f64>,
246 #[stat(desc = "Per-node pinned task utilization (100% = one full CPU)")]
247 pub node_pinned_utils: Vec<f64>,
248 #[stat(desc = "Per-node pinned task counts")]
249 pub node_pinned_tasks: Vec<u64>,
250 #[stat(desc = "Per-node load (100% = one full CPU, from duty cycle sum)")]
251 pub node_loads: Vec<f64>,
252 #[stat(desc = "Whether xnuma gating is active for this layer (0/1)")]
253 pub xnuma_active: u32,
254}
255
256impl LayerStats {
257 pub fn new(
258 lidx: usize,
259 layer: &Layer,
260 stats: &Stats,
261 bstats: &BpfStats,
262 nr_cpus_range: (usize, usize),
263 xnuma_active: bool,
264 ) -> Self {
265 let lstat = |sidx| bstats.lstats[lidx][sidx];
266 let ltotal = lstat(LSTAT_SEL_LOCAL)
267 + lstat(LSTAT_ENQ_LOCAL)
268 + lstat(LSTAT_ENQ_WAKEUP)
269 + lstat(LSTAT_ENQ_EXPIRE)
270 + lstat(LSTAT_ENQ_REENQ)
271 + lstat(LSTAT_KEEP);
272 let lstat_pct = |sidx| {
273 if ltotal != 0 {
274 lstat(sidx) as f64 / ltotal as f64 * 100.0
275 } else {
276 0.0
277 }
278 };
279
280 let util_sum = stats.layer_utils[lidx]
281 .iter()
282 .take(LAYER_USAGE_SUM_UPTO + 1)
283 .sum::<f64>();
284
285 let util_comp_sum = stats.layer_utils_compensated[lidx]
286 .iter()
287 .take(LAYER_USAGE_SUM_UPTO + 1)
288 .sum::<f64>();
289
290 let membw_frac = match &layer.kind {
291 LayerKind::Open { .. } => 0.0,
293 LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
294 if let Some(membw_limit_gb) = membw_gb {
296 stats.layer_membws[lidx]
297 .iter()
298 .take(LAYER_USAGE_SUM_UPTO + 1)
299 .sum::<f64>()
300 / (*membw_limit_gb * (1024_u64.pow(3) as f64))
301 } else {
302 0.0
303 }
304 }
305 };
306
307 Self {
308 index: lidx,
309 util: util_sum * 100.0,
310 util_compensated: util_comp_sum * 100.0,
311 util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
312 util_protected_frac: calc_frac(
313 stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
314 util_sum,
315 ),
316 util_protected_preempt_frac: calc_frac(
317 stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
318 util_sum,
319 ),
320 util_frac: calc_frac(util_sum, stats.total_util),
321 tasks: stats.nr_layer_tasks[lidx] as u32,
322 total: ltotal,
323 sel_local: lstat_pct(LSTAT_SEL_LOCAL),
324 enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
325 enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
326 enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
327 enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
328 enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
329 min_exec: lstat_pct(LSTAT_MIN_EXEC),
330 min_exec_us: lstat(LSTAT_MIN_EXEC_NS) / 1000,
331 open_idle: lstat_pct(LSTAT_OPEN_IDLE),
332 preempt: lstat_pct(LSTAT_PREEMPT),
333 preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
334 preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
335 preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
336 preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
337 preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
338 affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
339 keep: lstat_pct(LSTAT_KEEP),
340 keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
341 keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
342 is_excl: layer.kind.common().exclusive as u32,
343 excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
344 excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
345 yielded: lstat_pct(LSTAT_YIELD),
346 yield_ignore: lstat(LSTAT_YIELD_IGNORE),
347 migration: lstat_pct(LSTAT_MIGRATION),
348 xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
349 xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
350 xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
351 xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
352 xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
353 llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
354 llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
355 skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
356 cpus: layer.cpus.as_raw_slice().to_vec(),
357 cur_nr_cpus: layer.cpus.weight() as u32,
358 min_nr_cpus: nr_cpus_range.0 as u32,
359 max_nr_cpus: nr_cpus_range.1 as u32,
360 nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
361 slice_us: stats.layer_slice_us[lidx],
362 llc_fracs: {
363 let sid = LLC_LSTAT_CNT;
364 let sum = bstats.llc_lstats[lidx]
365 .iter()
366 .map(|lstats| lstats[sid])
367 .sum::<u64>() as f64;
368 bstats.llc_lstats[lidx]
369 .iter()
370 .map(|lstats| calc_frac(lstats[sid] as f64, sum))
371 .collect()
372 },
373 llc_lats: bstats.llc_lstats[lidx]
374 .iter()
375 .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
376 .collect(),
377 membw_pct: membw_frac * 100.0,
378 dsq_insert_ewma: stats.layer_dsq_insert_ewma[lidx] * 100.0,
379 node_utils: stats.layer_node_utils[lidx]
380 .iter()
381 .map(|u| u * 100.0)
382 .collect(),
383 node_pinned_utils: stats.layer_node_pinned_utils[lidx]
384 .iter()
385 .map(|u| u * 100.0)
386 .collect(),
387 node_pinned_tasks: stats.layer_nr_node_pinned_tasks[lidx].clone(),
388 node_loads: stats.layer_node_duty_sums[lidx]
389 .iter()
390 .map(|l| l * 100.0)
391 .collect(),
392 xnuma_active: if xnuma_active { 1 } else { 0 },
393 }
394 }
395
396 pub fn format<W: Write>(
397 &self,
398 w: &mut W,
399 name: &str,
400 topo: Option<&Topology>,
401 max_width: usize,
402 no_llc: bool,
403 ) -> Result<()> {
404 let comp_str = if self.util > 0.1 && (self.util_compensated - self.util).abs() > 0.1 {
406 let overhead_pct = (1.0 - self.util / self.util_compensated) * 100.0;
407 format!(" comp_overhead={:.1}%", overhead_pct)
408 } else {
409 String::new()
410 };
411 writeln!(
412 w,
413 "\n\u{25B6} {} \u{2500} util/open/frac={:6.1}/{}/{:7.1}{} prot/prot_preempt={}/{} tasks={:6}",
414 name,
415 self.util,
416 fmt_pct(self.util_open_frac),
417 self.util_frac,
418 comp_str,
419 fmt_pct(self.util_protected_frac),
420 fmt_pct(self.util_protected_preempt_frac),
421 self.tasks,
422 )?;
423
424 writeln!(
426 w,
427 " {:<7} tot={} dd_sel/enq={}/{} dsq/10s={}/{} wake/exp/re={}/{}/{}",
428 "sched",
429 fmt_num(self.total),
430 fmt_pct(self.sel_local),
431 fmt_pct(self.enq_local),
432 fmt_pct(self.enq_dsq),
433 fmt_pct(self.dsq_insert_ewma),
434 fmt_pct(self.enq_wakeup),
435 fmt_pct(self.enq_expire),
436 fmt_pct(self.enq_reenq),
437 )?;
438
439 writeln!(
441 w,
442 " {:<7} keep/max/busy={}/{}/{} yield/ign={}/{} slc={} min_ex={}/{}",
443 "exec",
444 fmt_pct(self.keep),
445 fmt_pct(self.keep_fail_max_exec),
446 fmt_pct(self.keep_fail_busy),
447 fmt_pct(self.yielded),
448 fmt_num(self.yield_ignore),
449 fmt_duration_ms(self.slice_us as f64 / 1000.0),
450 fmt_pct(self.min_exec),
451 fmt_duration_ms(self.min_exec_us as f64 / 1000.0),
452 )?;
453
454 writeln!(
456 w,
457 " {:<7} mig={} xnuma={} xllc/skip={}/{} open_idle={} affn_viol={}",
458 "mig",
459 fmt_pct(self.migration),
460 fmt_pct(self.xnuma_migration),
461 fmt_pct(self.xllc_migration),
462 fmt_pct(self.xllc_migration_skip),
463 fmt_pct(self.open_idle),
464 fmt_pct(self.affn_viol),
465 )?;
466
467 writeln!(
469 w,
470 " {:<7} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
471 "preempt",
472 fmt_pct(self.preempt),
473 fmt_pct(self.preempt_first),
474 fmt_pct(self.preempt_xllc),
475 fmt_pct(self.preempt_xnuma),
476 fmt_pct(self.preempt_idle),
477 fmt_pct(self.preempt_fail),
478 )?;
479
480 writeln!(
482 w,
483 " {:<7} wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
484 "xlayer",
485 fmt_pct(self.xlayer_wake),
486 fmt_pct(self.xlayer_rewake),
487 fmt_pct(self.llc_drain),
488 fmt_pct(self.llc_drain_try),
489 fmt_pct(self.skip_remote_node),
490 )?;
491
492 if self.node_utils.len() > 1 {
494 let prefix = " node pin/ut/ld ";
495 let cell_width = 25;
497 let usable = if max_width > prefix.len() {
498 max_width - prefix.len()
499 } else {
500 60
501 };
502 let cells_per_row = (usable / cell_width).max(1);
503
504 for nid in 0..self.node_utils.len() {
505 let util = self.node_utils[nid];
506 let load = self.node_loads.get(nid).copied().unwrap_or(0.0);
507 let pin = self.node_pinned_utils.get(nid).copied().unwrap_or(0.0);
508 if nid % cells_per_row == 0 {
509 if nid > 0 {
510 writeln!(w)?;
511 }
512 write!(w, "{prefix}")?;
513 } else {
514 write!(w, " ")?;
515 }
516 write!(w, "N{}={:5.1}/{:5.1}/{:7.1}", nid, pin, util, load)?;
517 }
518 writeln!(w)?;
519 }
520
521 let cpumask = Cpumask::from_vec(self.cpus.clone());
523
524 if let Some(topo) = topo {
525 let header = topo.format_cpumask_header(&cpumask, self.min_nr_cpus, self.max_nr_cpus);
526 writeln!(w, " {}", header)?;
527 if cpumask.weight() > 0 {
528 topo.format_cpumask_grid(w, &cpumask, " ", max_width)?;
529 }
530 } else {
531 writeln!(
532 w,
533 " cpus={:3} [{:3},{:3}] {}",
534 self.cur_nr_cpus, self.min_nr_cpus, self.max_nr_cpus, &cpumask,
535 )?;
536 }
537
538 if self.is_excl != 0 {
540 writeln!(
541 w,
542 " excl_coll={} excl_preempt={}",
543 fmt_pct(self.excl_collision),
544 fmt_pct(self.excl_preempt),
545 )?;
546 } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
547 warn!(
548 "{}: exclusive is off but excl_coll={} excl_preempt={}",
549 name,
550 fmt_pct(self.excl_collision),
551 fmt_pct(self.excl_preempt),
552 );
553 }
554
555 if !no_llc {
557 let active_llcs: Vec<(usize, f64, f64)> = self
559 .llc_fracs
560 .iter()
561 .zip(self.llc_lats.iter())
562 .enumerate()
563 .filter(|(i, (&frac, _))| {
564 let nr_cpus = self.nr_llc_cpus.get(*i).copied().unwrap_or(0);
565 nr_cpus > 0 || frac > 0.0
566 })
567 .map(|(i, (&frac, &lat))| (i, frac, lat))
568 .collect();
569
570 if !active_llcs.is_empty() {
571 let indent = " ";
572 writeln!(w, "{indent}LLC sched%/lat_ms")?;
573 let cell_width = 14;
575 let usable = if max_width > indent.len() {
576 max_width - indent.len()
577 } else {
578 60
579 };
580 let cells_per_row = (usable / cell_width).max(1);
581
582 for (col, &(llc_id, frac, lat)) in active_llcs.iter().enumerate() {
583 if col % cells_per_row == 0 {
584 if col > 0 {
585 writeln!(w)?;
586 }
587 write!(w, "{indent}")?;
588 } else {
589 write!(w, " ")?;
590 }
591 write!(w, "[{:02}]{}/{:4.1}", llc_id, fmt_pct(frac), lat * 1_000.0)?;
592 }
593 writeln!(w)?;
594 }
595 }
596
597 Ok(())
598 }
599}
600
601#[stat_doc]
602#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
603#[stat(top)]
604pub struct SysStats {
605 #[stat(desc = "timestamp", _om_skip)]
606 pub at: f64,
607 #[stat(desc = "# of NUMA nodes")]
608 pub nr_nodes: usize,
609 #[stat(desc = "# sched events during the period")]
610 pub total: u64,
611 #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
612 pub local_sel: f64,
613 #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
614 pub local_enq: f64,
615 #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
616 pub open_idle: f64,
617 #[stat(desc = "% violated config due to CPU affinity")]
618 pub affn_viol: f64,
619 #[stat(desc = "% sent to hi fallback DSQs")]
620 pub hi_fb: f64,
621 #[stat(desc = "% sent to lo fallback DSQs")]
622 pub lo_fb: f64,
623 #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
624 pub excl_collision: f64,
625 #[stat(desc = "count of times a sibling CPU was preempted for an excl task")]
626 pub excl_preempt: f64,
627 #[stat(desc = "count of times a CPU skipped dispatching due to an excl task on the sibling")]
628 pub excl_idle: f64,
629 #[stat(
630 desc = "count of times an idle sibling CPU was woken up after an excl task is finished"
631 )]
632 pub excl_wakeup: f64,
633 #[stat(desc = "CPU time this binary consumed during the period")]
634 pub proc_ms: u64,
635 #[stat(desc = "CPU busy % (100% means all CPU)")]
636 pub busy: f64,
637 #[stat(desc = "CPU util % (100% means one CPU)")]
638 pub util: f64,
639 #[stat(desc = "CPU util % used by hi fallback DSQs")]
640 pub hi_fb_util: f64,
641 #[stat(desc = "CPU util % used by lo fallback DSQs")]
642 pub lo_fb_util: f64,
643 #[stat(desc = "Number of tasks dispatched via antistall")]
644 pub antistall: u64,
645 #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
646 pub skip_preempt: u64,
647 #[stat(desc = "Number of times vtime was out of range and fixed up")]
648 pub fixup_vtime: u64,
649 #[stat(desc = "Number of times cpuc->preempting_task didn't come on the CPU")]
650 pub preempting_mismatch: u64,
651 #[stat(desc = "per-node fallback CPUs")]
652 pub fallback_cpus: BTreeMap<u32, u32>,
653 #[stat(desc = "per-layer statistics")]
654 pub fallback_cpu_util: f64,
655 #[stat(desc = "fallback CPU util %")]
656 pub layers: BTreeMap<String, LayerStats>,
657 #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
658 pub gpu_tasks_affinitized: u64,
659 #[stat(desc = "Time (in ms) of last affinitization run.")]
660 pub gpu_task_affinitization_ms: u64,
661 #[stat(desc = "System CPU utilization EWMA (10s window)")]
662 pub system_cpu_util_ewma: f64,
663}
664
665impl SysStats {
666 pub fn new(
667 stats: &Stats,
668 bstats: &BpfStats,
669 fallback_cpus: &BTreeMap<usize, usize>,
670 ) -> Result<Self> {
671 let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
672 let total = lsum(LSTAT_SEL_LOCAL)
673 + lsum(LSTAT_ENQ_LOCAL)
674 + lsum(LSTAT_ENQ_WAKEUP)
675 + lsum(LSTAT_ENQ_EXPIRE)
676 + lsum(LSTAT_ENQ_REENQ)
677 + lsum(LSTAT_KEEP);
678 let lsum_pct = |idx| {
679 if total != 0 {
680 lsum(idx) as f64 / total as f64 * 100.0
681 } else {
682 0.0
683 }
684 };
685
686 let elapsed_ns = stats.elapsed.as_nanos();
687
688 Ok(Self {
689 at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
690 nr_nodes: stats.topo.nodes.len(),
691 total,
692 local_sel: lsum_pct(LSTAT_SEL_LOCAL),
693 local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
694 open_idle: lsum_pct(LSTAT_OPEN_IDLE),
695 affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
696 hi_fb: calc_frac(
697 stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
698 total as f64,
699 ),
700 lo_fb: calc_frac(
701 stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
702 total as f64,
703 ),
704 excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
705 excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
706 excl_idle: bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64,
707 excl_wakeup: bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64,
708 proc_ms: stats.processing_dur.as_millis() as u64,
709 busy: stats.cpu_busy * 100.0,
710 util: stats.total_util * 100.0,
711 hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
712 * 100.0,
713 lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
714 * 100.0,
715 antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
716 skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
717 fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
718 preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
719 fallback_cpus: fallback_cpus
720 .iter()
721 .map(|(&k, &v)| (k as u32, v as u32))
722 .collect(),
723 fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
724 / elapsed_ns as f64
725 * 100.0,
726 layers: BTreeMap::new(),
727 gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
728 gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
729 system_cpu_util_ewma: stats.system_cpu_util_ewma * 100.0,
730 })
731 }
732
733 pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
734 writeln!(
735 w,
736 "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
737 self.total,
738 fmt_pct(self.local_sel),
739 fmt_pct(self.local_enq),
740 fmt_pct(self.open_idle),
741 fmt_pct(self.affn_viol),
742 fmt_pct(self.hi_fb),
743 fmt_pct(self.lo_fb),
744 )?;
745
746 let single_node = self.fallback_cpus.len() == 1;
747 let fb_cpus_str: Vec<String> = self
748 .fallback_cpus
749 .iter()
750 .map(|(n, c)| {
751 if single_node {
752 format!("{}", c)
753 } else {
754 format!("N{}:{}", n, c)
755 }
756 })
757 .collect();
758 writeln!(
759 w,
760 "busy={:5.1} util/hi/lo={:7.1}/{}/{} fb_cpus=[{}]/util={:4.1} proc={}ms sys_util_10s={:5.1}",
761 self.busy,
762 self.util,
763 fmt_pct(self.hi_fb_util),
764 fmt_pct(self.lo_fb_util),
765 fb_cpus_str.join(","),
766 self.fallback_cpu_util,
767 self.proc_ms,
768 self.system_cpu_util_ewma,
769 )?;
770
771 writeln!(
772 w,
773 "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
774 self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
775 )?;
776
777 writeln!(
778 w,
779 "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
780 self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
781 )?;
782
783 writeln!(
784 w,
785 "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
786 self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
787 )?;
788
789 Ok(())
790 }
791
792 pub fn format_all<W: Write>(
793 &self,
794 w: &mut W,
795 topo: Option<&Topology>,
796 max_width: usize,
797 no_llc: bool,
798 ) -> Result<()> {
799 self.format(w)?;
800
801 let mut idx_to_name: Vec<(usize, &String)> =
802 self.layers.iter().map(|(k, v)| (v.index, k)).collect();
803
804 idx_to_name.sort();
805
806 for (_idx, name) in &idx_to_name {
807 self.layers[*name].format(w, name, topo, max_width, no_llc)?;
808 }
809
810 Ok(())
811 }
812}
813
/// Requests sent over the request channel by the stats ops built in
/// `server_data`. Every variant carries the id of the sending thread.
#[derive(Debug)]
pub enum StatsReq {
    /// Sent when a stats session is opened; answered with `StatsRes::Hello`.
    Hello(ThreadId),
    /// Sent on every read, handing the currently held `Stats` back to the
    /// peer; answered with `StatsRes::Refreshed`.
    Refresh(ThreadId, Box<Stats>),
    /// Sent when a stats session is closed; answered with `StatsRes::Bye`.
    Bye(ThreadId),
}
820
/// Responses received over the response channel by the stats ops built in
/// `server_data`, one per `StatsReq` variant.
#[derive(Debug)]
pub enum StatsRes {
    /// Reply to `StatsReq::Hello`: the initial `Stats` the session holds on to.
    Hello(Box<Stats>),
    /// Reply to `StatsReq::Refresh`: the `Stats` to hold until the next read,
    /// together with the `SysStats` returned to the reader as JSON.
    Refreshed(Box<(Stats, SysStats)>),
    /// Reply to `StatsReq::Bye`.
    Bye,
}
827
828pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
829 let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
830 let tid = current().id();
831 req_ch.send(StatsReq::Hello(tid))?;
832 let mut stats = Some(match res_ch.recv()? {
833 StatsRes::Hello(v) => *v,
834 res => bail!("invalid response to Hello: {:?}", res),
835 });
836
837 let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
838 Box::new(move |_args, (req_ch, res_ch)| {
839 req_ch.send(StatsReq::Refresh(tid, Box::new(stats.take().unwrap())))?;
840 let (new_stats, sys_stats) = match res_ch.recv()? {
841 StatsRes::Refreshed(v) => *v,
842 res => bail!("invalid response to Refresh: {:?}", res),
843 };
844 stats = Some(new_stats);
845 sys_stats.to_json()
846 });
847
848 Ok(read)
849 });
850
851 let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
852 req_ch.send(StatsReq::Bye(current().id())).unwrap();
853 match res_ch.recv().unwrap() {
854 StatsRes::Bye => {}
855 res => panic!("invalid response to Bye: {:?}", res),
856 }
857 });
858
859 StatsServerData::new()
860 .add_meta(LayerStats::meta())
861 .add_meta(SysStats::meta())
862 .add_ops(
863 "top",
864 StatsOps {
865 open,
866 close: Some(close),
867 },
868 )
869}
870
871pub fn monitor(
872 intv: Duration,
873 shutdown: Arc<AtomicBool>,
874 max_width: usize,
875 no_llc: bool,
876) -> Result<()> {
877 let topo = Topology::new().ok();
878 scx_utils::monitor_stats::<SysStats>(
879 &[],
880 intv,
881 || shutdown.load(Ordering::Relaxed),
882 |sst| {
883 let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
884 let header = format!("\u{2501}\u{2501} {} ", dt.to_rfc2822());
885 let pad = max_width.saturating_sub(header.chars().count());
886 println!("{}{}", header, "\u{2501}".repeat(pad));
887 sst.format_all(&mut std::io::stdout(), topo.as_ref(), max_width, no_llc)
888 },
889 )
890}