1use std::collections::BTreeMap;
2use std::io::Write;
3use std::sync::atomic::AtomicBool;
4use std::sync::atomic::Ordering;
5use std::sync::Arc;
6use std::thread::current;
7use std::thread::ThreadId;
8use std::time::Duration;
9use std::time::SystemTime;
10use std::time::UNIX_EPOCH;
11
12use anyhow::bail;
13use anyhow::Result;
14use chrono::DateTime;
15use chrono::Local;
16use scx_stats::prelude::*;
17use scx_stats_derive::stat_doc;
18use scx_stats_derive::Stats;
19use scx_utils::Cpumask;
20use scx_utils::Topology;
21use serde::Deserialize;
22use serde::Serialize;
23use tracing::warn;
24
25use crate::bpf_intf;
26use crate::BpfStats;
27use crate::Layer;
28use crate::LayerKind;
29use crate::Stats;
30use crate::LAYER_USAGE_OPEN;
31use crate::LAYER_USAGE_PROTECTED;
32use crate::LAYER_USAGE_PROTECTED_PREEMPT;
33use crate::LAYER_USAGE_SUM_UPTO;
34
// Indices into the BPF global stat counter array (global_stat_id enum).
const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;

// Indices into each layer's BPF stat counter array (layer_stat_id enum).
const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;

// Indices into each layer's per-LLC stat array (llc_layer_stat_id enum).
const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;
83
/// Return `a` as a percentage of `b`; yields 0.0 when `b` is zero so the
/// caller never sees NaN or infinity.
fn calc_frac(a: f64, b: f64) -> f64 {
    if b == 0.0 {
        0.0
    } else {
        a / b * 100.0
    }
}
91
/// Format a percentage into a fixed 4-character-wide field, trading decimal
/// places for range. Values that are positive but would round down to 0.00
/// are clamped up to 0.01 so activity is never displayed as zero.
fn fmt_pct(v: f64) -> String {
    if v >= 99.95 {
        return format!("{v:4.0}");
    }
    if v >= 10.0 {
        return format!("{v:4.1}");
    }
    if v > 0.0 && v < 0.01 {
        return format!("{:4.2}", 0.01);
    }
    format!("{v:4.2}")
}
103
104fn fmt_duration_ms(ms: f64) -> String {
105 if ms >= 60_000.0 {
106 let min = ms / 60_000.0;
107 if min >= 100.0 {
108 format!("{:.0}min", min)
109 } else {
110 format!("{:.1}min", min)
111 }
112 } else if ms >= 1_000.0 {
113 let s = ms / 1_000.0;
114 if s >= 100.0 {
115 format!("{:.0}s", s)
116 } else {
117 format!("{:.1}s", s)
118 }
119 } else if ms >= 10.0 {
120 format!("{:.0}ms", ms)
121 } else {
122 format!("{:.1}ms", ms)
123 }
124}
125
/// Render a counter into a fixed 6-character field, scaling to `k`/`m`
/// suffixes above 1,000 and 1,000,000 respectively (exclusive thresholds).
fn fmt_num(v: u64) -> String {
    match v {
        n if n > 1_000_000 => format!("{:5.1}m", n as f64 / 1_000_000.0),
        n if n > 1_000 => format!("{:5.1}k", n as f64 / 1_000.0),
        n => format!("{n:5.0} "),
    }
}
135
136#[stat_doc]
137#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
138#[stat(_om_prefix = "l_", _om_label = "layer_name")]
139pub struct LayerStats {
140 #[stat(desc = "index", _om_skip)]
141 pub index: usize,
142 #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
143 pub util: f64,
144 #[stat(desc = "Protected CPU utilization %")]
145 pub util_protected_frac: f64,
146 #[stat(desc = "Preempt-protected CPU utilization %")]
147 pub util_protected_preempt_frac: f64,
148 #[stat(desc = "Open CPU utilization %")]
149 pub util_open_frac: f64,
150 #[stat(desc = "fraction of total CPU utilization")]
151 pub util_frac: f64,
152 #[stat(desc = "number of tasks")]
153 pub tasks: u32,
154 #[stat(desc = "count of sched events during the period")]
155 pub total: u64,
156 #[stat(desc = "% dispatched into idle CPU from select_cpu")]
157 pub sel_local: f64,
158 #[stat(desc = "% dispatched into idle CPU from enqueue")]
159 pub enq_local: f64,
160 #[stat(desc = "% enqueued after wakeup")]
161 pub enq_wakeup: f64,
162 #[stat(desc = "% enqueued after slice expiration")]
163 pub enq_expire: f64,
164 #[stat(desc = "% re-enqueued due to RT preemption")]
165 pub enq_reenq: f64,
166 #[stat(desc = "% enqueued into the layer's LLC DSQs")]
167 pub enq_dsq: f64,
168 #[stat(desc = "count of times exec duration < min_exec_us")]
169 pub min_exec: f64,
170 #[stat(desc = "total exec durations extended due to min_exec_us")]
171 pub min_exec_us: u64,
172 #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
173 pub open_idle: f64,
174 #[stat(desc = "% preempted other tasks")]
175 pub preempt: f64,
176 #[stat(desc = "% preempted XLLC tasks")]
177 pub preempt_xllc: f64,
178 #[stat(desc = "% preempted across NUMA nodes")]
179 pub preempt_xnuma: f64,
180 #[stat(desc = "% first-preempted other tasks")]
181 pub preempt_first: f64,
182 #[stat(desc = "% idle-preempted other tasks")]
183 pub preempt_idle: f64,
184 #[stat(desc = "% attempted to preempt other tasks but failed")]
185 pub preempt_fail: f64,
186 #[stat(desc = "% violated config due to CPU affinity")]
187 pub affn_viol: f64,
188 #[stat(desc = "% continued executing after slice expiration")]
189 pub keep: f64,
190 #[stat(desc = "% disallowed to continue executing due to max_exec")]
191 pub keep_fail_max_exec: f64,
192 #[stat(desc = "% disallowed to continue executing due to other tasks")]
193 pub keep_fail_busy: f64,
194 #[stat(desc = "whether is exclusive", _om_skip)]
195 pub is_excl: u32,
196 #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
197 pub excl_collision: f64,
198 #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
199 pub excl_preempt: f64,
200 #[stat(desc = "% yielded")]
201 pub yielded: f64,
202 #[stat(desc = "count of times yield was ignored")]
203 pub yield_ignore: u64,
204 #[stat(desc = "% migrated across CPUs")]
205 pub migration: f64,
206 #[stat(desc = "% migrated across NUMA nodes")]
207 pub xnuma_migration: f64,
208 #[stat(desc = "% migrated across LLCs")]
209 pub xllc_migration: f64,
210 #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
211 pub xllc_migration_skip: f64,
212 #[stat(desc = "% wakers across layers")]
213 pub xlayer_wake: f64,
214 #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
215 pub xlayer_rewake: f64,
216 #[stat(desc = "% LLC draining tried")]
217 pub llc_drain_try: f64,
218 #[stat(desc = "% LLC draining succeeded")]
219 pub llc_drain: f64,
220 #[stat(desc = "% skip LLC dispatch on remote node")]
221 pub skip_remote_node: f64,
222 #[stat(desc = "mask of allocated CPUs", _om_skip)]
223 pub cpus: Vec<u64>,
224 #[stat(desc = "count of CPUs assigned")]
225 pub cur_nr_cpus: u32,
226 #[stat(desc = "minimum # of CPUs assigned")]
227 pub min_nr_cpus: u32,
228 #[stat(desc = "maximum # of CPUs assigned")]
229 pub max_nr_cpus: u32,
230 #[stat(desc = "count of CPUs assigned per LLC")]
231 pub nr_llc_cpus: Vec<u32>,
232 #[stat(desc = "slice duration config")]
233 pub slice_us: u64,
234 #[stat(desc = "Per-LLC scheduling event fractions")]
235 pub llc_fracs: Vec<f64>,
236 #[stat(desc = "Per-LLC average latency")]
237 pub llc_lats: Vec<f64>,
238 #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\"")]
239 pub membw_pct: f64,
240 #[stat(desc = "DSQ insertion ratio EWMA (10s window)")]
241 pub dsq_insert_ewma: f64,
242 #[stat(desc = "Per-node layer utilization (100% = one full CPU)")]
243 pub node_utils: Vec<f64>,
244 #[stat(desc = "Per-node pinned task utilization (100% = one full CPU)")]
245 pub node_pinned_utils: Vec<f64>,
246 #[stat(desc = "Per-node pinned task counts")]
247 pub node_pinned_tasks: Vec<u64>,
248 #[stat(desc = "Per-node load (100% = one full CPU, from duty cycle sum)")]
249 pub node_loads: Vec<f64>,
250 #[stat(desc = "Whether xnuma gating is active for this layer (0/1)")]
251 pub xnuma_active: u32,
252}
253
254impl LayerStats {
    /// Build a `LayerStats` snapshot for the layer at index `lidx`.
    ///
    /// Event percentages come from the per-layer BPF counters in `bstats`;
    /// utilization figures come from the user-space aggregates in `stats`.
    /// `nr_cpus_range` is the layer's (min, max) CPU allocation and
    /// `xnuma_active` reports whether cross-NUMA gating is in effect.
    pub fn new(
        lidx: usize,
        layer: &Layer,
        stats: &Stats,
        bstats: &BpfStats,
        nr_cpus_range: (usize, usize),
        xnuma_active: bool,
    ) -> Self {
        // Raw per-layer counter accessor and the total scheduling-event
        // count used as the denominator for the percentage fields.
        let lstat = |sidx| bstats.lstats[lidx][sidx];
        let ltotal = lstat(LSTAT_SEL_LOCAL)
            + lstat(LSTAT_ENQ_LOCAL)
            + lstat(LSTAT_ENQ_WAKEUP)
            + lstat(LSTAT_ENQ_EXPIRE)
            + lstat(LSTAT_ENQ_REENQ)
            + lstat(LSTAT_KEEP);
        // Counter as a % of all events; 0 when the layer saw no events.
        let lstat_pct = |sidx| {
            if ltotal != 0 {
                lstat(sidx) as f64 / ltotal as f64 * 100.0
            } else {
                0.0
            }
        };

        // Total layer utilization summed over the usage classes that count
        // towards the layer total (indices 0..=LAYER_USAGE_SUM_UPTO).
        let util_sum = stats.layer_utils[lidx]
            .iter()
            .take(LAYER_USAGE_SUM_UPTO + 1)
            .sum::<f64>();

        // Fraction of the configured memory-bandwidth limit consumed.
        // Open layers and layers without a configured limit report 0.
        // The limit is configured in GiB, hence the 1024^3 scaling.
        let membw_frac = match &layer.kind {
            LayerKind::Open { .. } => 0.0,
            LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
                if let Some(membw_limit_gb) = membw_gb {
                    stats.layer_membws[lidx]
                        .iter()
                        .take(LAYER_USAGE_SUM_UPTO + 1)
                        .sum::<f64>()
                        / (*membw_limit_gb * (1024_u64.pow(3) as f64))
                } else {
                    0.0
                }
            }
        };

        Self {
            index: lidx,
            util: util_sum * 100.0,
            util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
            util_protected_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
                util_sum,
            ),
            util_protected_preempt_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
                util_sum,
            ),
            util_frac: calc_frac(util_sum, stats.total_util),
            tasks: stats.nr_layer_tasks[lidx] as u32,
            total: ltotal,
            sel_local: lstat_pct(LSTAT_SEL_LOCAL),
            enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
            enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
            enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
            enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
            enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
            min_exec: lstat_pct(LSTAT_MIN_EXEC),
            // BPF reports nanoseconds; surface microseconds.
            min_exec_us: lstat(LSTAT_MIN_EXEC_NS) / 1000,
            open_idle: lstat_pct(LSTAT_OPEN_IDLE),
            preempt: lstat_pct(LSTAT_PREEMPT),
            preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
            preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
            preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
            preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
            preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
            affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
            keep: lstat_pct(LSTAT_KEEP),
            keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
            keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
            is_excl: layer.kind.common().exclusive as u32,
            excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
            yielded: lstat_pct(LSTAT_YIELD),
            yield_ignore: lstat(LSTAT_YIELD_IGNORE),
            migration: lstat_pct(LSTAT_MIGRATION),
            xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
            xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
            cpus: layer.cpus.as_raw_slice().to_vec(),
            cur_nr_cpus: layer.cpus.weight() as u32,
            min_nr_cpus: nr_cpus_range.0 as u32,
            max_nr_cpus: nr_cpus_range.1 as u32,
            nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
            slice_us: stats.layer_slice_us[lidx],
            // Share of scheduling events handled by each LLC.
            llc_fracs: {
                let sid = LLC_LSTAT_CNT;
                let sum = bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| lstats[sid])
                    .sum::<u64>() as f64;
                bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| calc_frac(lstats[sid] as f64, sum))
                    .collect()
            },
            // Per-LLC latency converted from nanoseconds to seconds.
            llc_lats: bstats.llc_lstats[lidx]
                .iter()
                .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
                .collect(),
            membw_pct: membw_frac * 100.0,
            dsq_insert_ewma: stats.layer_dsq_insert_ewma[lidx] * 100.0,
            node_utils: stats.layer_node_utils[lidx]
                .iter()
                .map(|u| u * 100.0)
                .collect(),
            node_pinned_utils: stats.layer_node_pinned_utils[lidx]
                .iter()
                .map(|u| u * 100.0)
                .collect(),
            node_pinned_tasks: stats.layer_nr_node_pinned_tasks[lidx].clone(),
            node_loads: stats.layer_node_duty_sums[lidx]
                .iter()
                .map(|l| l * 100.0)
                .collect(),
            xnuma_active: if xnuma_active { 1 } else { 0 },
        }
    }
387
    /// Render the layer's stats to `w` in human-readable form.
    ///
    /// When `topo` is available the CPU mask is drawn as a topology grid;
    /// otherwise a compact hex mask line is printed. `max_width` bounds
    /// line width for the grid-style sections and `no_llc` suppresses the
    /// per-LLC breakdown.
    pub fn format<W: Write>(
        &self,
        w: &mut W,
        name: &str,
        topo: Option<&Topology>,
        max_width: usize,
        no_llc: bool,
    ) -> Result<()> {
        // Header line: layer name plus utilization summary.
        writeln!(
            w,
            "\n\u{25B6} {} \u{2500} util/open/frac={:6.1}/{}/{:7.1} prot/prot_preempt={}/{} tasks={:6}",
            name,
            self.util,
            fmt_pct(self.util_open_frac),
            self.util_frac,
            fmt_pct(self.util_protected_frac),
            fmt_pct(self.util_protected_preempt_frac),
            self.tasks,
        )?;

        // Scheduling-event breakdown.
        writeln!(
            w,
            " {:<7} tot={} dd_sel/enq={}/{} dsq/10s={}/{} wake/exp/re={}/{}/{}",
            "sched",
            fmt_num(self.total),
            fmt_pct(self.sel_local),
            fmt_pct(self.enq_local),
            fmt_pct(self.enq_dsq),
            fmt_pct(self.dsq_insert_ewma),
            fmt_pct(self.enq_wakeup),
            fmt_pct(self.enq_expire),
            fmt_pct(self.enq_reenq),
        )?;

        // Execution behavior: slice keeping, yielding, slice/min-exec config.
        writeln!(
            w,
            " {:<7} keep/max/busy={}/{}/{} yield/ign={}/{} slc={} min_ex={}/{}",
            "exec",
            fmt_pct(self.keep),
            fmt_pct(self.keep_fail_max_exec),
            fmt_pct(self.keep_fail_busy),
            fmt_pct(self.yielded),
            fmt_num(self.yield_ignore),
            fmt_duration_ms(self.slice_us as f64 / 1000.0),
            fmt_pct(self.min_exec),
            fmt_duration_ms(self.min_exec_us as f64 / 1000.0),
        )?;

        // Migration behavior across CPUs / NUMA nodes / LLCs.
        writeln!(
            w,
            " {:<7} mig={} xnuma={} xllc/skip={}/{} open_idle={} affn_viol={}",
            "mig",
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
            fmt_pct(self.open_idle),
            fmt_pct(self.affn_viol),
        )?;

        // Preemption outcomes.
        writeln!(
            w,
            " {:<7} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
            "preempt",
            fmt_pct(self.preempt),
            fmt_pct(self.preempt_first),
            fmt_pct(self.preempt_xllc),
            fmt_pct(self.preempt_xnuma),
            fmt_pct(self.preempt_idle),
            fmt_pct(self.preempt_fail),
        )?;

        // Cross-layer wakeups and LLC draining.
        writeln!(
            w,
            " {:<7} wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
            "xlayer",
            fmt_pct(self.xlayer_wake),
            fmt_pct(self.xlayer_rewake),
            fmt_pct(self.llc_drain),
            fmt_pct(self.llc_drain_try),
            fmt_pct(self.skip_remote_node),
        )?;

        // Per-node section, only when there is more than one node. Cells
        // are wrapped into rows sized to fit within max_width.
        if self.node_utils.len() > 1 {
            let prefix = " node pin/ut/ld ";
            let cell_width = 25;
            let usable = if max_width > prefix.len() {
                max_width - prefix.len()
            } else {
                60
            };
            let cells_per_row = (usable / cell_width).max(1);

            for nid in 0..self.node_utils.len() {
                let util = self.node_utils[nid];
                let load = self.node_loads.get(nid).copied().unwrap_or(0.0);
                let pin = self.node_pinned_utils.get(nid).copied().unwrap_or(0.0);
                if nid % cells_per_row == 0 {
                    if nid > 0 {
                        writeln!(w)?;
                    }
                    write!(w, "{prefix}")?;
                } else {
                    write!(w, " ")?;
                }
                write!(w, "N{}={:5.1}/{:5.1}/{:7.1}", nid, pin, util, load)?;
            }
            writeln!(w)?;
        }

        let cpumask = Cpumask::from_vec(self.cpus.clone());

        // CPU allocation: topology grid when available, compact mask
        // otherwise.
        if let Some(topo) = topo {
            let header = topo.format_cpumask_header(&cpumask, self.min_nr_cpus, self.max_nr_cpus);
            writeln!(w, " {}", header)?;
            if cpumask.weight() > 0 {
                topo.format_cpumask_grid(w, &cpumask, " ", max_width)?;
            }
        } else {
            writeln!(
                w,
                " cpus={:3} [{:3},{:3}] {}",
                self.cur_nr_cpus, self.min_nr_cpus, self.max_nr_cpus, &cpumask,
            )?;
        }

        // Exclusive-execution stats are only meaningful for exclusive
        // layers; non-zero values on a non-exclusive layer indicate a bug.
        if self.is_excl != 0 {
            writeln!(
                w,
                " excl_coll={} excl_preempt={}",
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            )?;
        } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
            warn!(
                "{}: exclusive is off but excl_coll={} excl_preempt={}",
                name,
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            );
        }

        // Per-LLC breakdown: only LLCs that have CPUs assigned or saw
        // events are shown, wrapped into rows like the node section.
        if !no_llc {
            let active_llcs: Vec<(usize, f64, f64)> = self
                .llc_fracs
                .iter()
                .zip(self.llc_lats.iter())
                .enumerate()
                .filter(|(i, (&frac, _))| {
                    let nr_cpus = self.nr_llc_cpus.get(*i).copied().unwrap_or(0);
                    nr_cpus > 0 || frac > 0.0
                })
                .map(|(i, (&frac, &lat))| (i, frac, lat))
                .collect();

            if !active_llcs.is_empty() {
                let indent = " ";
                writeln!(w, "{indent}LLC sched%/lat_ms")?;
                let cell_width = 14;
                let usable = if max_width > indent.len() {
                    max_width - indent.len()
                } else {
                    60
                };
                let cells_per_row = (usable / cell_width).max(1);

                for (col, &(llc_id, frac, lat)) in active_llcs.iter().enumerate() {
                    if col % cells_per_row == 0 {
                        if col > 0 {
                            writeln!(w)?;
                        }
                        write!(w, "{indent}")?;
                    } else {
                        write!(w, " ")?;
                    }
                    // Latency is stored in seconds; display milliseconds.
                    write!(w, "[{:02}]{}/{:4.1}", llc_id, fmt_pct(frac), lat * 1_000.0)?;
                }
                writeln!(w)?;
            }
        }

        Ok(())
    }
584}
585
586#[stat_doc]
587#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
588#[stat(top)]
589pub struct SysStats {
590 #[stat(desc = "timestamp", _om_skip)]
591 pub at: f64,
592 #[stat(desc = "# of NUMA nodes")]
593 pub nr_nodes: usize,
594 #[stat(desc = "# sched events during the period")]
595 pub total: u64,
596 #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
597 pub local_sel: f64,
598 #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
599 pub local_enq: f64,
600 #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
601 pub open_idle: f64,
602 #[stat(desc = "% violated config due to CPU affinity")]
603 pub affn_viol: f64,
604 #[stat(desc = "% sent to hi fallback DSQs")]
605 pub hi_fb: f64,
606 #[stat(desc = "% sent to lo fallback DSQs")]
607 pub lo_fb: f64,
608 #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
609 pub excl_collision: f64,
610 #[stat(desc = "count of times a sibling CPU was preempted for an excl task")]
611 pub excl_preempt: f64,
612 #[stat(desc = "count of times a CPU skipped dispatching due to an excl task on the sibling")]
613 pub excl_idle: f64,
614 #[stat(
615 desc = "count of times an idle sibling CPU was woken up after an excl task is finished"
616 )]
617 pub excl_wakeup: f64,
618 #[stat(desc = "CPU time this binary consumed during the period")]
619 pub proc_ms: u64,
620 #[stat(desc = "CPU busy % (100% means all CPU)")]
621 pub busy: f64,
622 #[stat(desc = "CPU util % (100% means one CPU)")]
623 pub util: f64,
624 #[stat(desc = "CPU util % used by hi fallback DSQs")]
625 pub hi_fb_util: f64,
626 #[stat(desc = "CPU util % used by lo fallback DSQs")]
627 pub lo_fb_util: f64,
628 #[stat(desc = "Number of tasks dispatched via antistall")]
629 pub antistall: u64,
630 #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
631 pub skip_preempt: u64,
632 #[stat(desc = "Number of times vtime was out of range and fixed up")]
633 pub fixup_vtime: u64,
634 #[stat(desc = "Number of times cpuc->preempting_task didn't come on the CPU")]
635 pub preempting_mismatch: u64,
636 #[stat(desc = "per-node fallback CPUs")]
637 pub fallback_cpus: BTreeMap<u32, u32>,
638 #[stat(desc = "per-layer statistics")]
639 pub fallback_cpu_util: f64,
640 #[stat(desc = "fallback CPU util %")]
641 pub layers: BTreeMap<String, LayerStats>,
642 #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
643 pub gpu_tasks_affinitized: u64,
644 #[stat(desc = "Time (in ms) of last affinitization run.")]
645 pub gpu_task_affinitization_ms: u64,
646 #[stat(desc = "System CPU utilization EWMA (10s window)")]
647 pub system_cpu_util_ewma: f64,
648}
649
650impl SysStats {
651 pub fn new(
652 stats: &Stats,
653 bstats: &BpfStats,
654 fallback_cpus: &BTreeMap<usize, usize>,
655 ) -> Result<Self> {
656 let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
657 let total = lsum(LSTAT_SEL_LOCAL)
658 + lsum(LSTAT_ENQ_LOCAL)
659 + lsum(LSTAT_ENQ_WAKEUP)
660 + lsum(LSTAT_ENQ_EXPIRE)
661 + lsum(LSTAT_ENQ_REENQ)
662 + lsum(LSTAT_KEEP);
663 let lsum_pct = |idx| {
664 if total != 0 {
665 lsum(idx) as f64 / total as f64 * 100.0
666 } else {
667 0.0
668 }
669 };
670
671 let elapsed_ns = stats.elapsed.as_nanos();
672
673 Ok(Self {
674 at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
675 nr_nodes: stats.topo.nodes.len(),
676 total,
677 local_sel: lsum_pct(LSTAT_SEL_LOCAL),
678 local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
679 open_idle: lsum_pct(LSTAT_OPEN_IDLE),
680 affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
681 hi_fb: calc_frac(
682 stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
683 total as f64,
684 ),
685 lo_fb: calc_frac(
686 stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
687 total as f64,
688 ),
689 excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
690 excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
691 excl_idle: bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64,
692 excl_wakeup: bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64,
693 proc_ms: stats.processing_dur.as_millis() as u64,
694 busy: stats.cpu_busy * 100.0,
695 util: stats.total_util * 100.0,
696 hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
697 * 100.0,
698 lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
699 * 100.0,
700 antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
701 skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
702 fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
703 preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
704 fallback_cpus: fallback_cpus
705 .iter()
706 .map(|(&k, &v)| (k as u32, v as u32))
707 .collect(),
708 fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
709 / elapsed_ns as f64
710 * 100.0,
711 layers: BTreeMap::new(),
712 gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
713 gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
714 system_cpu_util_ewma: stats.system_cpu_util_ewma * 100.0,
715 })
716 }
717
    /// Render the system-wide summary lines to `w`.
    pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
        // Scheduling-event summary.
        writeln!(
            w,
            "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
            self.total,
            fmt_pct(self.local_sel),
            fmt_pct(self.local_enq),
            fmt_pct(self.open_idle),
            fmt_pct(self.affn_viol),
            fmt_pct(self.hi_fb),
            fmt_pct(self.lo_fb),
        )?;

        // Fallback CPUs: omit the node prefix on single-node systems.
        let single_node = self.fallback_cpus.len() == 1;
        let fb_cpus_str: Vec<String> = self
            .fallback_cpus
            .iter()
            .map(|(n, c)| {
                if single_node {
                    format!("{}", c)
                } else {
                    format!("N{}:{}", n, c)
                }
            })
            .collect();
        // Utilization summary.
        writeln!(
            w,
            "busy={:5.1} util/hi/lo={:7.1}/{}/{} fb_cpus=[{}]/util={:4.1} proc={}ms sys_util_10s={:5.1}",
            self.busy,
            self.util,
            fmt_pct(self.hi_fb_util),
            fmt_pct(self.lo_fb_util),
            fb_cpus_str.join(","),
            self.fallback_cpu_util,
            self.proc_ms,
            self.system_cpu_util_ewma,
        )?;

        // Exclusive-execution counters.
        writeln!(
            w,
            "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
            self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
        )?;

        // Anomaly / fixup counters.
        writeln!(
            w,
            "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
            self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
        )?;

        // GPU task affinitization summary.
        writeln!(
            w,
            "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
            self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
        )?;

        Ok(())
    }
776
777 pub fn format_all<W: Write>(
778 &self,
779 w: &mut W,
780 topo: Option<&Topology>,
781 max_width: usize,
782 no_llc: bool,
783 ) -> Result<()> {
784 self.format(w)?;
785
786 let mut idx_to_name: Vec<(usize, &String)> =
787 self.layers.iter().map(|(k, v)| (v.index, k)).collect();
788
789 idx_to_name.sort();
790
791 for (_idx, name) in &idx_to_name {
792 self.layers[*name].format(w, name, topo, max_width, no_llc)?;
793 }
794
795 Ok(())
796 }
797}
798
/// Requests sent from a stats-server connection to the scheduler thread.
#[derive(Debug)]
pub enum StatsReq {
    /// New connection announcing itself with its thread id.
    Hello(ThreadId),
    /// Ask for refreshed stats, handing the previous snapshot back.
    Refresh(ThreadId, Box<Stats>),
    /// Connection is shutting down.
    Bye(ThreadId),
}
805
/// Responses from the scheduler thread to a stats-server connection.
#[derive(Debug)]
pub enum StatsRes {
    /// Initial stats snapshot for a new connection.
    Hello(Box<Stats>),
    /// Updated snapshot plus the rendered system stats.
    Refreshed(Box<(Stats, SysStats)>),
    /// Acknowledge connection shutdown.
    Bye,
}
812
/// Build the scx_stats server description: metadata for [`LayerStats`] and
/// [`SysStats`] plus the open/read/close handlers implementing the
/// Hello/Refresh/Bye request protocol with the scheduler thread.
pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        // Announce this connection and receive the initial Stats snapshot,
        // which is then threaded through successive Refresh round trips.
        let tid = current().id();
        req_ch.send(StatsReq::Hello(tid))?;
        let mut stats = Some(match res_ch.recv()? {
            StatsRes::Hello(v) => *v,
            res => bail!("invalid response to Hello: {:?}", res),
        });

        // Each read hands the previous snapshot back to the scheduler and
        // stores the refreshed one for the next read.
        let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
            Box::new(move |_args, (req_ch, res_ch)| {
                req_ch.send(StatsReq::Refresh(tid, Box::new(stats.take().unwrap())))?;
                let (new_stats, sys_stats) = match res_ch.recv()? {
                    StatsRes::Refreshed(v) => *v,
                    res => bail!("invalid response to Refresh: {:?}", res),
                };
                stats = Some(new_stats);
                sys_stats.to_json()
            });

        Ok(read)
    });

    // Closing is best-effort teardown; panicking on protocol violation is
    // acceptable as the connection is going away anyway.
    let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        req_ch.send(StatsReq::Bye(current().id())).unwrap();
        match res_ch.recv().unwrap() {
            StatsRes::Bye => {}
            res => panic!("invalid response to Bye: {:?}", res),
        }
    });

    StatsServerData::new()
        .add_meta(LayerStats::meta())
        .add_meta(SysStats::meta())
        .add_ops(
            "top",
            StatsOps {
                open,
                close: Some(close),
            },
        )
}
855
856pub fn monitor(
857 intv: Duration,
858 shutdown: Arc<AtomicBool>,
859 max_width: usize,
860 no_llc: bool,
861) -> Result<()> {
862 let topo = Topology::new().ok();
863 scx_utils::monitor_stats::<SysStats>(
864 &[],
865 intv,
866 || shutdown.load(Ordering::Relaxed),
867 |sst| {
868 let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
869 let header = format!("\u{2501}\u{2501} {} ", dt.to_rfc2822());
870 let pad = max_width.saturating_sub(header.chars().count());
871 println!("{}{}", header, "\u{2501}".repeat(pad));
872 sst.format_all(&mut std::io::stdout(), topo.as_ref(), max_width, no_llc)
873 },
874 )
875}