1use std::collections::BTreeMap;
2use std::io::Write;
3use std::sync::atomic::AtomicBool;
4use std::sync::atomic::Ordering;
5use std::sync::Arc;
6use std::thread::current;
7use std::thread::ThreadId;
8use std::time::Duration;
9use std::time::SystemTime;
10use std::time::UNIX_EPOCH;
11
12use anyhow::bail;
13use anyhow::Result;
14use chrono::DateTime;
15use chrono::Local;
16use scx_stats::prelude::*;
17use scx_stats_derive::stat_doc;
18use scx_stats_derive::Stats;
19use scx_utils::Cpumask;
20use scx_utils::Topology;
21use serde::Deserialize;
22use serde::Serialize;
23use tracing::warn;
24
25use crate::bpf_intf;
26use crate::BpfStats;
27use crate::Layer;
28use crate::LayerKind;
29use crate::Stats;
30use crate::LAYER_USAGE_OPEN;
31use crate::LAYER_USAGE_PROTECTED;
32use crate::LAYER_USAGE_PROTECTED_PREEMPT;
33use crate::LAYER_USAGE_SUM_UPTO;
34
// Shorthand usize indices into the BPF-side stat arrays. The enum values are
// generated in bpf_intf; they are cast once here so the rest of the file can
// index the stat slices directly.

// Global (system-wide) stat indices.
const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;

// Per-layer stat indices.
const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;

// Per-LLC, per-layer stat indices.
const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;
83
/// Percentage that `a` represents of `b`. A zero denominator yields 0.0
/// instead of NaN/inf so downstream formatting and serialization stay sane.
fn calc_frac(a: f64, b: f64) -> f64 {
    if b == 0.0 {
        return 0.0;
    }
    a / b * 100.0
}
91
/// Format a percentage into a fixed 4-character-wide field, varying the
/// precision with magnitude. Values that are positive but would round to
/// 0.00 are clamped up to 0.01 so activity is never displayed as zero.
fn fmt_pct(v: f64) -> String {
    match v {
        x if x >= 99.95 => format!("{:4.0}", x),
        x if x >= 10.0 => format!("{:4.1}", x),
        x if x > 0.0 && x < 0.01 => format!("{:4.2}", 0.01),
        x => format!("{:4.2}", x),
    }
}
103
/// Format a duration given in milliseconds with a unit suffix, picking the
/// largest unit (min/s/ms) and a precision that keeps the output compact.
fn fmt_duration_ms(ms: f64) -> String {
    if ms >= 60_000.0 {
        let mins = ms / 60_000.0;
        return if mins >= 100.0 {
            format!("{:.0}min", mins)
        } else {
            format!("{:.1}min", mins)
        };
    }
    if ms >= 1_000.0 {
        let secs = ms / 1_000.0;
        return if secs >= 100.0 {
            format!("{:.0}s", secs)
        } else {
            format!("{:.1}s", secs)
        };
    }
    if ms >= 10.0 {
        return format!("{:.0}ms", ms);
    }
    format!("{:.1}ms", ms)
}
125
/// Format a count into a fixed-width field with an m/k suffix for large
/// values; small values get a trailing space so columns stay aligned.
fn fmt_num(v: u64) -> String {
    match v {
        n if n > 1_000_000 => format!("{:5.1}m", n as f64 / 1_000_000.0),
        n if n > 1_000 => format!("{:5.1}k", n as f64 / 1_000.0),
        n => format!("{:5.0} ", n),
    }
}
135
136#[stat_doc]
137#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
138#[stat(_om_prefix = "l_", _om_label = "layer_name")]
139pub struct LayerStats {
140 #[stat(desc = "index", _om_skip)]
141 pub index: usize,
142 #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
143 pub util: f64,
144 #[stat(desc = "Protected CPU utilization %")]
145 pub util_protected_frac: f64,
146 #[stat(desc = "Preempt-protected CPU utilization %")]
147 pub util_protected_preempt_frac: f64,
148 #[stat(desc = "Open CPU utilization %")]
149 pub util_open_frac: f64,
150 #[stat(desc = "fraction of total CPU utilization")]
151 pub util_frac: f64,
152 #[stat(desc = "number of tasks")]
153 pub tasks: u32,
154 #[stat(desc = "count of sched events during the period")]
155 pub total: u64,
156 #[stat(desc = "% dispatched into idle CPU from select_cpu")]
157 pub sel_local: f64,
158 #[stat(desc = "% dispatched into idle CPU from enqueue")]
159 pub enq_local: f64,
160 #[stat(desc = "% enqueued after wakeup")]
161 pub enq_wakeup: f64,
162 #[stat(desc = "% enqueued after slice expiration")]
163 pub enq_expire: f64,
164 #[stat(desc = "% re-enqueued due to RT preemption")]
165 pub enq_reenq: f64,
166 #[stat(desc = "% enqueued into the layer's LLC DSQs")]
167 pub enq_dsq: f64,
168 #[stat(desc = "count of times exec duration < min_exec_us")]
169 pub min_exec: f64,
170 #[stat(desc = "total exec durations extended due to min_exec_us")]
171 pub min_exec_us: u64,
172 #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
173 pub open_idle: f64,
174 #[stat(desc = "% preempted other tasks")]
175 pub preempt: f64,
176 #[stat(desc = "% preempted XLLC tasks")]
177 pub preempt_xllc: f64,
178 #[stat(desc = "% preempted across NUMA nodes")]
179 pub preempt_xnuma: f64,
180 #[stat(desc = "% first-preempted other tasks")]
181 pub preempt_first: f64,
182 #[stat(desc = "% idle-preempted other tasks")]
183 pub preempt_idle: f64,
184 #[stat(desc = "% attempted to preempt other tasks but failed")]
185 pub preempt_fail: f64,
186 #[stat(desc = "% violated config due to CPU affinity")]
187 pub affn_viol: f64,
188 #[stat(desc = "% continued executing after slice expiration")]
189 pub keep: f64,
190 #[stat(desc = "% disallowed to continue executing due to max_exec")]
191 pub keep_fail_max_exec: f64,
192 #[stat(desc = "% disallowed to continue executing due to other tasks")]
193 pub keep_fail_busy: f64,
194 #[stat(desc = "whether is exclusive", _om_skip)]
195 pub is_excl: u32,
196 #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
197 pub excl_collision: f64,
198 #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
199 pub excl_preempt: f64,
200 #[stat(desc = "% yielded")]
201 pub yielded: f64,
202 #[stat(desc = "count of times yield was ignored")]
203 pub yield_ignore: u64,
204 #[stat(desc = "% migrated across CPUs")]
205 pub migration: f64,
206 #[stat(desc = "% migrated across NUMA nodes")]
207 pub xnuma_migration: f64,
208 #[stat(desc = "% migrated across LLCs")]
209 pub xllc_migration: f64,
210 #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
211 pub xllc_migration_skip: f64,
212 #[stat(desc = "% wakers across layers")]
213 pub xlayer_wake: f64,
214 #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
215 pub xlayer_rewake: f64,
216 #[stat(desc = "% LLC draining tried")]
217 pub llc_drain_try: f64,
218 #[stat(desc = "% LLC draining succeeded")]
219 pub llc_drain: f64,
220 #[stat(desc = "% skip LLC dispatch on remote node")]
221 pub skip_remote_node: f64,
222 #[stat(desc = "mask of allocated CPUs", _om_skip)]
223 pub cpus: Vec<u64>,
224 #[stat(desc = "count of CPUs assigned")]
225 pub cur_nr_cpus: u32,
226 #[stat(desc = "minimum # of CPUs assigned")]
227 pub min_nr_cpus: u32,
228 #[stat(desc = "maximum # of CPUs assigned")]
229 pub max_nr_cpus: u32,
230 #[stat(desc = "count of CPUs assigned per LLC")]
231 pub nr_llc_cpus: Vec<u32>,
232 #[stat(desc = "slice duration config")]
233 pub slice_us: u64,
234 #[stat(desc = "Per-LLC scheduling event fractions")]
235 pub llc_fracs: Vec<f64>,
236 #[stat(desc = "Per-LLC average latency")]
237 pub llc_lats: Vec<f64>,
238 #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\"")]
239 pub membw_pct: f64,
240 #[stat(desc = "DSQ insertion ratio EWMA (10s window)")]
241 pub dsq_insert_ewma: f64,
242 #[stat(desc = "Per-node layer utilization (100% = one full CPU)")]
243 pub node_utils: Vec<f64>,
244 #[stat(desc = "Per-node pinned task utilization (100% = one full CPU)")]
245 pub node_pinned_utils: Vec<f64>,
246 #[stat(desc = "Per-node pinned task counts")]
247 pub node_pinned_tasks: Vec<u64>,
248 #[stat(desc = "Per-node load (100% = one full CPU, from duty cycle sum)")]
249 pub node_loads: Vec<f64>,
250 #[stat(desc = "Whether xnuma gating is active for this layer (0/1)")]
251 pub xnuma_active: u32,
252}
253
254impl LayerStats {
    /// Build a `LayerStats` snapshot for layer `lidx` from the userspace
    /// aggregates in `stats` and the raw BPF counters in `bstats`.
    ///
    /// `nr_cpus_range` is the (min, max) CPU allocation for the layer and
    /// `xnuma_active` reports whether cross-NUMA gating is engaged.
    pub fn new(
        lidx: usize,
        layer: &Layer,
        stats: &Stats,
        bstats: &BpfStats,
        nr_cpus_range: (usize, usize),
        xnuma_active: bool,
    ) -> Self {
        // Raw per-layer BPF counter accessor.
        let lstat = |sidx| bstats.lstats[lidx][sidx];
        // Total scheduling events for the layer this period; denominator for
        // the percentage stats below.
        let ltotal = lstat(LSTAT_SEL_LOCAL)
            + lstat(LSTAT_ENQ_LOCAL)
            + lstat(LSTAT_ENQ_WAKEUP)
            + lstat(LSTAT_ENQ_EXPIRE)
            + lstat(LSTAT_ENQ_REENQ)
            + lstat(LSTAT_KEEP);
        // Percentage of total events; guards against an empty period.
        let lstat_pct = |sidx| {
            if ltotal != 0 {
                lstat(sidx) as f64 / ltotal as f64 * 100.0
            } else {
                0.0
            }
        };

        // Utilization summed over the usage classes that count toward the
        // layer total (indices 0..=LAYER_USAGE_SUM_UPTO).
        let util_sum = stats.layer_utils[lidx]
            .iter()
            .take(LAYER_USAGE_SUM_UPTO + 1)
            .sum::<f64>();

        // Memory bandwidth consumed as a fraction of the configured limit
        // (membw_gb scaled to bytes). Open layers and layers without a
        // configured limit report 0.
        let membw_frac = match &layer.kind {
            LayerKind::Open { .. } => 0.0,
            LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
                if let Some(membw_limit_gb) = membw_gb {
                    stats.layer_membws[lidx]
                        .iter()
                        .take(LAYER_USAGE_SUM_UPTO + 1)
                        .sum::<f64>()
                        / ((*membw_limit_gb * (1024_u64.pow(3) as f64)) as f64)
                } else {
                    0.0
                }
            }
        };

        Self {
            index: lidx,
            util: util_sum * 100.0,
            util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
            util_protected_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
                util_sum,
            ),
            util_protected_preempt_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
                util_sum,
            ),
            util_frac: calc_frac(util_sum, stats.total_util),
            tasks: stats.nr_layer_tasks[lidx] as u32,
            total: ltotal,
            sel_local: lstat_pct(LSTAT_SEL_LOCAL),
            enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
            enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
            enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
            enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
            enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
            min_exec: lstat_pct(LSTAT_MIN_EXEC),
            // Accumulated nanoseconds converted to microseconds.
            min_exec_us: (lstat(LSTAT_MIN_EXEC_NS) / 1000) as u64,
            open_idle: lstat_pct(LSTAT_OPEN_IDLE),
            preempt: lstat_pct(LSTAT_PREEMPT),
            preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
            preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
            preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
            preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
            preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
            affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
            keep: lstat_pct(LSTAT_KEEP),
            keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
            keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
            is_excl: layer.kind.common().exclusive as u32,
            excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
            yielded: lstat_pct(LSTAT_YIELD),
            yield_ignore: lstat(LSTAT_YIELD_IGNORE) as u64,
            migration: lstat_pct(LSTAT_MIGRATION),
            xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
            xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
            cpus: layer.cpus.as_raw_slice().to_vec(),
            cur_nr_cpus: layer.cpus.weight() as u32,
            min_nr_cpus: nr_cpus_range.0 as u32,
            max_nr_cpus: nr_cpus_range.1 as u32,
            nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
            slice_us: stats.layer_slice_us[lidx],
            // Share of the layer's scheduling events landing on each LLC,
            // as a percentage of the layer's LLC event total.
            llc_fracs: {
                let sid = LLC_LSTAT_CNT;
                let sum = bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| lstats[sid])
                    .sum::<u64>() as f64;
                bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| calc_frac(lstats[sid] as f64, sum))
                    .collect()
            },
            // Per-LLC latency scaled by 1e-9 (ns -> s, judging by the divisor).
            llc_lats: bstats.llc_lstats[lidx]
                .iter()
                .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
                .collect(),
            membw_pct: membw_frac * 100.0,
            dsq_insert_ewma: stats.layer_dsq_insert_ewma[lidx] * 100.0,
            node_utils: stats.layer_node_utils[lidx]
                .iter()
                .map(|u| u * 100.0)
                .collect(),
            node_pinned_utils: stats.layer_node_pinned_utils[lidx]
                .iter()
                .map(|u| u * 100.0)
                .collect(),
            node_pinned_tasks: stats.layer_nr_node_pinned_tasks[lidx].clone(),
            node_loads: stats.layer_node_duty_sums[lidx]
                .iter()
                .map(|l| l * 100.0)
                .collect(),
            xnuma_active: if xnuma_active { 1 } else { 0 },
        }
    }
387
    /// Write the multi-line human-readable report for this layer to `w`.
    ///
    /// `name` is the layer name used in the header; `topo`, when available,
    /// enables topology-aware CPU grid rendering; `max_width` bounds the
    /// rendered line width; `no_llc` suppresses the per-LLC section.
    pub fn format<W: Write>(
        &self,
        w: &mut W,
        name: &str,
        topo: Option<&Topology>,
        max_width: usize,
        no_llc: bool,
    ) -> Result<()> {
        // Header: utilization summary and task count.
        writeln!(
            w,
            "\n\u{25B6} {} \u{2500} util/open/frac={:6.1}/{}/{:7.1} prot/prot_preempt={}/{} tasks={:6}",
            name,
            self.util,
            fmt_pct(self.util_open_frac),
            self.util_frac,
            fmt_pct(self.util_protected_frac),
            fmt_pct(self.util_protected_preempt_frac),
            self.tasks,
        )?;

        // Scheduling event breakdown.
        writeln!(
            w,
            " {:<7} tot={} dd_sel/enq={}/{} dsq/10s={}/{} wake/exp/re={}/{}/{}",
            "sched",
            fmt_num(self.total),
            fmt_pct(self.sel_local),
            fmt_pct(self.enq_local),
            fmt_pct(self.enq_dsq),
            fmt_pct(self.dsq_insert_ewma),
            fmt_pct(self.enq_wakeup),
            fmt_pct(self.enq_expire),
            fmt_pct(self.enq_reenq),
        )?;

        // Execution behavior: slice keeping, yields, slice length and
        // min-exec extension.
        writeln!(
            w,
            " {:<7} keep/max/busy={}/{}/{} yield/ign={}/{} slc={} min_ex={}/{}",
            "exec",
            fmt_pct(self.keep),
            fmt_pct(self.keep_fail_max_exec),
            fmt_pct(self.keep_fail_busy),
            fmt_pct(self.yielded),
            fmt_num(self.yield_ignore),
            fmt_duration_ms(self.slice_us as f64 / 1000.0),
            fmt_pct(self.min_exec),
            fmt_duration_ms(self.min_exec_us as f64 / 1000.0),
        )?;

        // Migration statistics.
        writeln!(
            w,
            " {:<7} mig={} xnuma={} xllc/skip={}/{} open_idle={} affn_viol={}",
            "mig",
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
            fmt_pct(self.open_idle),
            fmt_pct(self.affn_viol),
        )?;

        // Preemption statistics.
        writeln!(
            w,
            " {:<7} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
            "preempt",
            fmt_pct(self.preempt),
            fmt_pct(self.preempt_first),
            fmt_pct(self.preempt_xllc),
            fmt_pct(self.preempt_xnuma),
            fmt_pct(self.preempt_idle),
            fmt_pct(self.preempt_fail),
        )?;

        // Cross-layer wakeups and LLC draining.
        writeln!(
            w,
            " {:<7} wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
            "xlayer",
            fmt_pct(self.xlayer_wake),
            fmt_pct(self.xlayer_rewake),
            fmt_pct(self.llc_drain),
            fmt_pct(self.llc_drain_try),
            fmt_pct(self.skip_remote_node),
        )?;

        // Per-node util/load grid; only shown on multi-node systems. Cells
        // are wrapped so each row fits within max_width.
        if self.node_utils.len() > 1 {
            let xnuma_tag = if self.xnuma_active != 0 {
                " [xnuma]"
            } else {
                ""
            };
            let prefix = format!(" node util/load{xnuma_tag} ");
            // Width budget per "N<id>=<util>/<load>" cell, separator included.
            let cell_width = 21;
            let usable = if max_width > prefix.len() {
                max_width - prefix.len()
            } else {
                60
            };
            let cells_per_row = (usable / cell_width).max(1);

            for nid in 0..self.node_utils.len() {
                let util = self.node_utils[nid];
                let load = self.node_loads.get(nid).copied().unwrap_or(0.0);
                if nid % cells_per_row == 0 {
                    if nid > 0 {
                        writeln!(w)?;
                    }
                    write!(w, "{prefix}")?;
                } else {
                    write!(w, " ")?;
                }
                write!(w, "N{}={:7.1}/{:7.1}", nid, util, load)?;
            }
            writeln!(w)?;
        }

        // Pinned-task grid; only shown when some node has pinned tasks.
        if self.node_pinned_tasks.iter().any(|t| *t > 0) {
            let prefix = " pinned util/tasks ";
            let cell_width = 19;
            let usable = if max_width > prefix.len() {
                max_width - prefix.len()
            } else {
                60
            };
            let cells_per_row = (usable / cell_width).max(1);

            for nid in 0..self.node_pinned_utils.len() {
                let util = self.node_pinned_utils[nid];
                let tasks = self.node_pinned_tasks.get(nid).copied().unwrap_or(0);
                if nid % cells_per_row == 0 {
                    if nid > 0 {
                        writeln!(w)?;
                    }
                    write!(w, "{prefix}")?;
                } else {
                    write!(w, " ")?;
                }
                write!(w, "N{}={:7.1}/{:5}", nid, util, tasks)?;
            }
            writeln!(w)?;
        }

        // CPU allocation: topology-aware grid when a Topology is available,
        // otherwise a flat mask dump with the [min, max] allocation range.
        let cpumask = Cpumask::from_vec(self.cpus.clone());

        if let Some(topo) = topo {
            let header = topo.format_cpumask_header(&cpumask, self.min_nr_cpus, self.max_nr_cpus);
            writeln!(w, " {}", header)?;
            if cpumask.weight() > 0 {
                topo.format_cpumask_grid(w, &cpumask, " ", max_width)?;
            }
        } else {
            writeln!(
                w,
                " cpus={:3} [{:3},{:3}] {}",
                self.cur_nr_cpus, self.min_nr_cpus, self.max_nr_cpus, &cpumask,
            )?;
        }

        // Exclusive counters are printed only for exclusive layers; nonzero
        // counters on a non-exclusive layer indicate an inconsistency and
        // are surfaced as a warning instead.
        if self.is_excl != 0 {
            writeln!(
                w,
                " excl_coll={} excl_preempt={}",
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            )?;
        } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
            warn!(
                "{}: exclusive is off but excl_coll={} excl_preempt={}",
                name,
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            );
        }

        // Per-LLC section: restricted to LLCs that have CPUs assigned or
        // recorded events.
        if !no_llc {
            let active_llcs: Vec<(usize, f64, f64)> = self
                .llc_fracs
                .iter()
                .zip(self.llc_lats.iter())
                .enumerate()
                .filter(|(i, (&frac, _))| {
                    let nr_cpus = self.nr_llc_cpus.get(*i).copied().unwrap_or(0);
                    nr_cpus > 0 || frac > 0.0
                })
                .map(|(i, (&frac, &lat))| (i, frac, lat))
                .collect();

            if !active_llcs.is_empty() {
                let indent = " ";
                writeln!(w, "{indent}LLC sched%/lat_ms")?;
                let cell_width = 14;
                let usable = if max_width > indent.len() {
                    max_width - indent.len()
                } else {
                    60
                };
                let cells_per_row = (usable / cell_width).max(1);

                for (col, &(llc_id, frac, lat)) in active_llcs.iter().enumerate() {
                    if col % cells_per_row == 0 {
                        if col > 0 {
                            writeln!(w)?;
                        }
                        write!(w, "{indent}")?;
                    } else {
                        write!(w, " ")?;
                    }
                    write!(w, "[{:02}]{}/{:4.1}", llc_id, fmt_pct(frac), lat * 1_000.0)?;
                }
                writeln!(w)?;
            }
        }

        Ok(())
    }
616}
617
618#[stat_doc]
619#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
620#[stat(top)]
621pub struct SysStats {
622 #[stat(desc = "timestamp", _om_skip)]
623 pub at: f64,
624 #[stat(desc = "# of NUMA nodes")]
625 pub nr_nodes: usize,
626 #[stat(desc = "# sched events during the period")]
627 pub total: u64,
628 #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
629 pub local_sel: f64,
630 #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
631 pub local_enq: f64,
632 #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
633 pub open_idle: f64,
634 #[stat(desc = "% violated config due to CPU affinity")]
635 pub affn_viol: f64,
636 #[stat(desc = "% sent to hi fallback DSQs")]
637 pub hi_fb: f64,
638 #[stat(desc = "% sent to lo fallback DSQs")]
639 pub lo_fb: f64,
640 #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
641 pub excl_collision: f64,
642 #[stat(desc = "count of times a sibling CPU was preempted for an excl task")]
643 pub excl_preempt: f64,
644 #[stat(desc = "count of times a CPU skipped dispatching due to an excl task on the sibling")]
645 pub excl_idle: f64,
646 #[stat(
647 desc = "count of times an idle sibling CPU was woken up after an excl task is finished"
648 )]
649 pub excl_wakeup: f64,
650 #[stat(desc = "CPU time this binary consumed during the period")]
651 pub proc_ms: u64,
652 #[stat(desc = "CPU busy % (100% means all CPU)")]
653 pub busy: f64,
654 #[stat(desc = "CPU util % (100% means one CPU)")]
655 pub util: f64,
656 #[stat(desc = "CPU util % used by hi fallback DSQs")]
657 pub hi_fb_util: f64,
658 #[stat(desc = "CPU util % used by lo fallback DSQs")]
659 pub lo_fb_util: f64,
660 #[stat(desc = "Number of tasks dispatched via antistall")]
661 pub antistall: u64,
662 #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
663 pub skip_preempt: u64,
664 #[stat(desc = "Number of times vtime was out of range and fixed up")]
665 pub fixup_vtime: u64,
666 #[stat(desc = "Number of times cpuc->preempting_task didn't come on the CPU")]
667 pub preempting_mismatch: u64,
668 #[stat(desc = "per-node fallback CPUs")]
669 pub fallback_cpus: BTreeMap<u32, u32>,
670 #[stat(desc = "per-layer statistics")]
671 pub fallback_cpu_util: f64,
672 #[stat(desc = "fallback CPU util %")]
673 pub layers: BTreeMap<String, LayerStats>,
674 #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
675 pub gpu_tasks_affinitized: u64,
676 #[stat(desc = "Time (in ms) of last affinitization run.")]
677 pub gpu_task_affinitization_ms: u64,
678 #[stat(desc = "System CPU utilization EWMA (10s window)")]
679 pub system_cpu_util_ewma: f64,
680}
681
682impl SysStats {
683 pub fn new(
684 stats: &Stats,
685 bstats: &BpfStats,
686 fallback_cpus: &BTreeMap<usize, usize>,
687 ) -> Result<Self> {
688 let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
689 let total = lsum(LSTAT_SEL_LOCAL)
690 + lsum(LSTAT_ENQ_LOCAL)
691 + lsum(LSTAT_ENQ_WAKEUP)
692 + lsum(LSTAT_ENQ_EXPIRE)
693 + lsum(LSTAT_ENQ_REENQ)
694 + lsum(LSTAT_KEEP);
695 let lsum_pct = |idx| {
696 if total != 0 {
697 lsum(idx) as f64 / total as f64 * 100.0
698 } else {
699 0.0
700 }
701 };
702
703 let elapsed_ns = stats.elapsed.as_nanos();
704
705 Ok(Self {
706 at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
707 nr_nodes: stats.topo.nodes.len(),
708 total,
709 local_sel: lsum_pct(LSTAT_SEL_LOCAL),
710 local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
711 open_idle: lsum_pct(LSTAT_OPEN_IDLE),
712 affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
713 hi_fb: calc_frac(
714 stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
715 total as f64,
716 ),
717 lo_fb: calc_frac(
718 stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
719 total as f64,
720 ),
721 excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
722 excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
723 excl_idle: bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64,
724 excl_wakeup: bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64,
725 proc_ms: stats.processing_dur.as_millis() as u64,
726 busy: stats.cpu_busy * 100.0,
727 util: stats.total_util * 100.0,
728 hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
729 * 100.0,
730 lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
731 * 100.0,
732 antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
733 skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
734 fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
735 preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
736 fallback_cpus: fallback_cpus
737 .iter()
738 .map(|(&k, &v)| (k as u32, v as u32))
739 .collect(),
740 fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
741 / elapsed_ns as f64
742 * 100.0,
743 layers: BTreeMap::new(),
744 gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
745 gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
746 system_cpu_util_ewma: stats.system_cpu_util_ewma * 100.0,
747 })
748 }
749
750 pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
751 writeln!(
752 w,
753 "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
754 self.total,
755 fmt_pct(self.local_sel),
756 fmt_pct(self.local_enq),
757 fmt_pct(self.open_idle),
758 fmt_pct(self.affn_viol),
759 fmt_pct(self.hi_fb),
760 fmt_pct(self.lo_fb),
761 )?;
762
763 let single_node = self.fallback_cpus.len() == 1;
764 let fb_cpus_str: Vec<String> = self
765 .fallback_cpus
766 .iter()
767 .map(|(n, c)| {
768 if single_node {
769 format!("{}", c)
770 } else {
771 format!("N{}:{}", n, c)
772 }
773 })
774 .collect();
775 writeln!(
776 w,
777 "busy={:5.1} util/hi/lo={:7.1}/{}/{} fb_cpus=[{}]/util={:4.1} proc={}ms sys_util_10s={:5.1}",
778 self.busy,
779 self.util,
780 fmt_pct(self.hi_fb_util),
781 fmt_pct(self.lo_fb_util),
782 fb_cpus_str.join(","),
783 self.fallback_cpu_util,
784 self.proc_ms,
785 self.system_cpu_util_ewma,
786 )?;
787
788 writeln!(
789 w,
790 "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
791 self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
792 )?;
793
794 writeln!(
795 w,
796 "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
797 self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
798 )?;
799
800 writeln!(
801 w,
802 "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
803 self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
804 )?;
805
806 Ok(())
807 }
808
809 pub fn format_all<W: Write>(
810 &self,
811 w: &mut W,
812 topo: Option<&Topology>,
813 max_width: usize,
814 no_llc: bool,
815 ) -> Result<()> {
816 self.format(w)?;
817
818 let mut idx_to_name: Vec<(usize, &String)> =
819 self.layers.iter().map(|(k, v)| (v.index, k)).collect();
820
821 idx_to_name.sort();
822
823 for (_idx, name) in &idx_to_name {
824 self.layers[*name].format(w, name, topo, max_width, no_llc)?;
825 }
826
827 Ok(())
828 }
829}
830
/// Requests sent from a stats client thread to the scheduler's stats loop.
/// Each request is answered with the matching `StatsRes` variant (see
/// `server_data` for the protocol).
#[derive(Debug)]
pub enum StatsReq {
    /// Register this client thread; answered with `StatsRes::Hello`.
    Hello(ThreadId),
    /// Return the previously held `Stats` snapshot and request a refreshed
    /// one; answered with `StatsRes::Refreshed`.
    Refresh(ThreadId, Stats),
    /// Deregister this client thread; answered with `StatsRes::Bye`.
    Bye(ThreadId),
}
837
/// Responses from the scheduler's stats loop to a stats client thread.
#[derive(Debug)]
pub enum StatsRes {
    /// Initial `Stats` snapshot handed to a newly registered client.
    Hello(Stats),
    /// Refreshed `Stats` snapshot plus the `SysStats` to report.
    Refreshed((Stats, SysStats)),
    /// Acknowledgement of client shutdown.
    Bye,
}
844
/// Construct the scx_stats server description exposing `SysStats` under the
/// "top" ops, wiring client open/read/close callbacks to the scheduler side
/// through the `StatsReq`/`StatsRes` channel pair.
pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
    // Open handler: register the client thread and receive the initial
    // Stats snapshot to hold between refreshes.
    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        let tid = current().id();
        req_ch.send(StatsReq::Hello(tid))?;
        let mut stats = Some(match res_ch.recv()? {
            StatsRes::Hello(v) => v,
            res => bail!("invalid response to Hello: {:?}", res),
        });

        // Read handler: swap the held snapshot for a refreshed one and
        // return the accompanying SysStats serialized to JSON.
        let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
            Box::new(move |_args, (req_ch, res_ch)| {
                req_ch.send(StatsReq::Refresh(tid, stats.take().unwrap()))?;
                let (new_stats, sys_stats) = match res_ch.recv()? {
                    StatsRes::Refreshed(v) => v,
                    res => bail!("invalid response to Refresh: {:?}", res),
                };
                stats = Some(new_stats);
                sys_stats.to_json()
            });

        Ok(read)
    });

    // Close handler: deregister the client thread and wait for the ack.
    let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        req_ch.send(StatsReq::Bye(current().id())).unwrap();
        match res_ch.recv().unwrap() {
            StatsRes::Bye => {}
            res => panic!("invalid response to Bye: {:?}", res),
        }
    });

    StatsServerData::new()
        .add_meta(LayerStats::meta())
        .add_meta(SysStats::meta())
        .add_ops(
            "top",
            StatsOps {
                open,
                close: Some(close),
            },
        )
}
887
/// Stand-alone stats monitor loop: polls `SysStats` via
/// `scx_utils::monitor_stats` every `intv` and prints a formatted report
/// until `shutdown` is set.
///
/// `max_width` bounds the rendered line width and `no_llc` suppresses the
/// per-LLC sections of the per-layer reports.
pub fn monitor(
    intv: Duration,
    shutdown: Arc<AtomicBool>,
    max_width: usize,
    no_llc: bool,
) -> Result<()> {
    // Topology is optional; without it, layer CPU masks print flat instead
    // of as a topology grid.
    let topo = Topology::new().ok();
    scx_utils::monitor_stats::<SysStats>(
        &[],
        intv,
        || shutdown.load(Ordering::Relaxed),
        |sst| {
            // Timestamped separator padded to max_width with '━' characters.
            let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
            let header = format!("\u{2501}\u{2501} {} ", dt.to_rfc2822());
            let pad = max_width.saturating_sub(header.chars().count());
            println!("{}{}", header, "\u{2501}".repeat(pad));
            sst.format_all(&mut std::io::stdout(), topo.as_ref(), max_width, no_llc)
        },
    )
}