use std::collections::BTreeMap;
use std::io::Write;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::current;
use std::thread::ThreadId;
use std::time::Duration;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;

use anyhow::bail;
use anyhow::Result;
use chrono::DateTime;
use chrono::Local;
use log::warn;
use scx_stats::prelude::*;
use scx_stats_derive::stat_doc;
use scx_stats_derive::Stats;
use scx_utils::Cpumask;
use serde::Deserialize;
use serde::Serialize;

use crate::bpf_intf;
use crate::BpfStats;
use crate::Layer;
use crate::LayerKind;
use crate::Stats;
use crate::LAYER_USAGE_OPEN;
use crate::LAYER_USAGE_PROTECTED;
use crate::LAYER_USAGE_PROTECTED_PREEMPT;
use crate::LAYER_USAGE_SUM_UPTO;

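// Indices into the BPF-side global, per-layer, and per-LLC stat arrays. These
// mirror the corresponding enums in bpf_intf so userspace and BPF agree on
// slot layout.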
const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;

const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;

const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;

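/// Express `a` as a percentage of `b`, returning 0.0 when `b` is zero to
/// avoid division-by-zero artifacts in the reported fractions.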
fn calc_frac(a: f64, b: f64) -> f64 {
    if b != 0.0 {
        a / b * 100.0
    } else {
        0.0
    }
}

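/// Format a percentage into a fixed five-column field. Values at or above
/// 99.995 are printed with one decimal so "100.0" still fits, and non-zero
/// values below 0.01 are bumped up to 0.01 so they don't render as 0.00.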
fn fmt_pct(v: f64) -> String {
    if v >= 99.995 {
        format!("{:5.1}", v)
    } else if v > 0.0 && v < 0.01 {
        format!("{:5.2}", 0.01)
    } else {
        format!("{:5.2}", v)
    }
}

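/// Format an event count compactly: millions get an "m" suffix, thousands a
/// "k" suffix, and smaller values are printed as plain integers (e.g. 12_345
/// renders as " 12.3k").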
fn fmt_num(v: u64) -> String {
    if v > 1_000_000 {
        format!("{:5.1}m", v as f64 / 1_000_000.0)
    } else if v > 1_000 {
        format!("{:5.1}k", v as f64 / 1_000.0)
    } else {
        format!("{:5.0} ", v)
    }
}

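/// Per-layer statistics reported through scx_stats. Percentage fields are
/// generally relative to the layer's own scheduling events or utilization;
/// see the individual field descriptions.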
#[stat_doc]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
#[stat(_om_prefix = "l_", _om_label = "layer_name")]
pub struct LayerStats {
    #[stat(desc = "index", _om_skip)]
    pub index: usize,
    #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
    pub util: f64,
    #[stat(desc = "Protected CPU utilization %")]
    pub util_protected_frac: f64,
    #[stat(desc = "Preempt-protected CPU utilization %")]
    pub util_protected_preempt_frac: f64,
    #[stat(desc = "Open CPU utilization %")]
    pub util_open_frac: f64,
    #[stat(desc = "fraction of total CPU utilization")]
    pub util_frac: f64,
    #[stat(desc = "number of tasks")]
    pub tasks: u32,
    #[stat(desc = "count of sched events during the period")]
    pub total: u64,
    #[stat(desc = "% dispatched into idle CPU from select_cpu")]
    pub sel_local: f64,
    #[stat(desc = "% dispatched into idle CPU from enqueue")]
    pub enq_local: f64,
    #[stat(desc = "% enqueued after wakeup")]
    pub enq_wakeup: f64,
    #[stat(desc = "% enqueued after slice expiration")]
    pub enq_expire: f64,
    #[stat(desc = "% re-enqueued due to RT preemption")]
    pub enq_reenq: f64,
    #[stat(desc = "% enqueued into the layer's LLC DSQs")]
    pub enq_dsq: f64,
    #[stat(desc = "count of times exec duration < min_exec_us")]
    pub min_exec: f64,
    #[stat(desc = "total exec durations extended due to min_exec_us")]
    pub min_exec_us: u64,
    #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
    pub open_idle: f64,
    #[stat(desc = "% preempted other tasks")]
    pub preempt: f64,
    #[stat(desc = "% preempted XLLC tasks")]
    pub preempt_xllc: f64,
    #[stat(desc = "% preempted XNUMA tasks")]
    pub preempt_xnuma: f64,
    #[stat(desc = "% first-preempted other tasks")]
    pub preempt_first: f64,
    #[stat(desc = "% idle-preempted other tasks")]
    pub preempt_idle: f64,
    #[stat(desc = "% attempted to preempt other tasks but failed")]
    pub preempt_fail: f64,
    #[stat(desc = "% violated config due to CPU affinity")]
    pub affn_viol: f64,
    #[stat(desc = "% continued executing after slice expiration")]
    pub keep: f64,
    #[stat(desc = "% disallowed to continue executing due to max_exec")]
    pub keep_fail_max_exec: f64,
    #[stat(desc = "% disallowed to continue executing due to other tasks")]
    pub keep_fail_busy: f64,
    #[stat(desc = "whether the layer is exclusive", _om_skip)]
    pub is_excl: u32,
    #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
    pub excl_collision: f64,
    #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
    pub excl_preempt: f64,
    #[stat(desc = "% yielded")]
    pub yielded: f64,
    #[stat(desc = "count of times yield was ignored")]
    pub yield_ignore: u64,
    #[stat(desc = "% migrated across CPUs")]
    pub migration: f64,
    #[stat(desc = "% migrated across NUMA nodes")]
    pub xnuma_migration: f64,
    #[stat(desc = "% migrated across LLCs")]
    pub xllc_migration: f64,
    #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
    pub xllc_migration_skip: f64,
    #[stat(desc = "% wakers across layers")]
    pub xlayer_wake: f64,
    #[stat(desc = "% rewakes across layers where the waker had woken the task previously")]
    pub xlayer_rewake: f64,
    #[stat(desc = "% LLC draining tried")]
    pub llc_drain_try: f64,
    #[stat(desc = "% LLC draining succeeded")]
    pub llc_drain: f64,
    #[stat(desc = "% skip LLC dispatch on remote node")]
    pub skip_remote_node: f64,
    #[stat(desc = "mask of allocated CPUs", _om_skip)]
    pub cpus: Vec<u64>,
    #[stat(desc = "count of CPUs assigned")]
    pub cur_nr_cpus: u32,
    #[stat(desc = "minimum # of CPUs assigned")]
    pub min_nr_cpus: u32,
    #[stat(desc = "maximum # of CPUs assigned")]
    pub max_nr_cpus: u32,
    #[stat(desc = "count of CPUs assigned per LLC")]
    pub nr_llc_cpus: Vec<u32>,
    #[stat(desc = "slice duration config")]
    pub slice_us: u64,
    #[stat(desc = "Per-LLC scheduling event fractions")]
    pub llc_fracs: Vec<f64>,
    #[stat(desc = "Per-LLC average latency")]
    pub llc_lats: Vec<f64>,
    #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\")")]
    pub membw_pct: f64,
}

impl LayerStats {
    pub fn new(
        lidx: usize,
        layer: &Layer,
        stats: &Stats,
        bstats: &BpfStats,
        nr_cpus_range: (usize, usize),
    ) -> Self {
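        // Helpers: `lstat` reads one raw per-layer counter from the BPF stats
        // and `lstat_pct` converts it into a percentage of all scheduling
        // events counted for this layer during the period.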
        let lstat = |sidx| bstats.lstats[lidx][sidx];
        let ltotal = lstat(LSTAT_SEL_LOCAL)
            + lstat(LSTAT_ENQ_LOCAL)
            + lstat(LSTAT_ENQ_WAKEUP)
            + lstat(LSTAT_ENQ_EXPIRE)
            + lstat(LSTAT_ENQ_REENQ)
            + lstat(LSTAT_KEEP);
        let lstat_pct = |sidx| {
            if ltotal != 0 {
                lstat(sidx) as f64 / ltotal as f64 * 100.0
            } else {
                0.0
            }
        };

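        // Total utilization attributed to the layer: the sum of the
        // usage-class buckets up to and including LAYER_USAGE_SUM_UPTO.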
        let util_sum = stats.layer_utils[lidx]
            .iter()
            .take(LAYER_USAGE_SUM_UPTO + 1)
            .sum::<f64>();

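        // Fraction of the configured memory bandwidth limit in use. Only
        // Confined and Grouped layers can set membw_gb; the limit is
        // normalized as GiB (1024^3 bytes), and open or unlimited layers
        // report 0.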
        let membw_frac = match &layer.kind {
            LayerKind::Open { .. } => 0.0,
            LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
                if let Some(membw_limit_gb) = membw_gb {
                    stats.layer_membws[lidx]
                        .iter()
                        .take(LAYER_USAGE_SUM_UPTO + 1)
                        .sum::<f64>()
                        / ((*membw_limit_gb * (1024_u64.pow(3) as f64)) as f64)
                } else {
                    0.0
                }
            }
        };

        Self {
            index: lidx,
            util: util_sum * 100.0,
            util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
            util_protected_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
                util_sum,
            ),
            util_protected_preempt_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
                util_sum,
            ),
            util_frac: calc_frac(util_sum, stats.total_util),
            tasks: stats.nr_layer_tasks[lidx] as u32,
            total: ltotal,
            sel_local: lstat_pct(LSTAT_SEL_LOCAL),
            enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
            enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
            enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
            enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
            enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
            min_exec: lstat_pct(LSTAT_MIN_EXEC),
            min_exec_us: (lstat(LSTAT_MIN_EXEC_NS) / 1000) as u64,
            open_idle: lstat_pct(LSTAT_OPEN_IDLE),
            preempt: lstat_pct(LSTAT_PREEMPT),
            preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
            preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
            preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
            preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
            preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
            affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
            keep: lstat_pct(LSTAT_KEEP),
            keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
            keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
            is_excl: layer.kind.common().exclusive as u32,
            excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
            yielded: lstat_pct(LSTAT_YIELD),
            yield_ignore: lstat(LSTAT_YIELD_IGNORE) as u64,
            migration: lstat_pct(LSTAT_MIGRATION),
            xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
            xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
            cpus: layer.cpus.as_raw_slice().to_vec(),
            cur_nr_cpus: layer.cpus.weight() as u32,
            min_nr_cpus: nr_cpus_range.0 as u32,
            max_nr_cpus: nr_cpus_range.1 as u32,
            nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
            slice_us: stats.layer_slice_us[lidx],
            llc_fracs: {
                let sid = LLC_LSTAT_CNT;
                let sum = bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| lstats[sid])
                    .sum::<u64>() as f64;
                bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| calc_frac(lstats[sid] as f64, sum))
                    .collect()
            },
            llc_lats: bstats.llc_lstats[lidx]
                .iter()
                .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
                .collect(),
            membw_pct: membw_frac * 100.0,
        }
    }

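    /// Write a multi-line, human-readable rendering of this layer's stats to
    /// `w`. `name` is the layer name and `header_width` is the column width
    /// used to align the per-layer blocks.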
    pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
        writeln!(
            w,
            " {:<width$}: util/open/frac={:6.1}/{}/{:7.1} prot/prot_preempt={}/{} tasks={:6}",
            name,
            self.util,
            fmt_pct(self.util_open_frac),
            self.util_frac,
            fmt_pct(self.util_protected_frac),
            fmt_pct(self.util_protected_preempt_frac),
            self.tasks,
            width = header_width,
        )?;

        writeln!(
            w,
            " {:<width$} tot={:7} local_sel/enq={}/{} enq_dsq={} wake/exp/reenq={}/{}/{}",
            "",
            self.total,
            fmt_pct(self.sel_local),
            fmt_pct(self.enq_local),
            fmt_pct(self.enq_dsq),
            fmt_pct(self.enq_wakeup),
            fmt_pct(self.enq_expire),
            fmt_pct(self.enq_reenq),
            width = header_width,
        )?;

        writeln!(
            w,
            " {:<width$} keep/max/busy={}/{}/{} yield/ign={}/{}",
            "",
            fmt_pct(self.keep),
            fmt_pct(self.keep_fail_max_exec),
            fmt_pct(self.keep_fail_busy),
            fmt_pct(self.yielded),
            fmt_num(self.yield_ignore),
            width = header_width,
        )?;

        writeln!(
            w,
            " {:<width$} open_idle={} mig={} xnuma_mig={} xllc_mig/skip={}/{} affn_viol={}",
            "",
            fmt_pct(self.open_idle),
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
            fmt_pct(self.affn_viol),
            width = header_width,
        )?;

        writeln!(
            w,
            " {:<width$} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
            "",
            fmt_pct(self.preempt),
            fmt_pct(self.preempt_first),
            fmt_pct(self.preempt_xllc),
            fmt_pct(self.preempt_xnuma),
            fmt_pct(self.preempt_idle),
            fmt_pct(self.preempt_fail),
            width = header_width,
        )?;

        writeln!(
            w,
            " {:<width$} xlayer_wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
            "",
            fmt_pct(self.xlayer_wake),
            fmt_pct(self.xlayer_rewake),
            fmt_pct(self.llc_drain),
            fmt_pct(self.llc_drain_try),
            fmt_pct(self.skip_remote_node),
            width = header_width,
        )?;

        writeln!(
            w,
            " {:<width$} slice={}ms min_exec={}/{:7.2}ms",
            "",
            self.slice_us as f64 / 1000.0,
            fmt_pct(self.min_exec),
            self.min_exec_us as f64 / 1000.0,
            width = header_width
        )?;

        let cpumask = Cpumask::from_vec(self.cpus.clone());

        writeln!(
            w,
            " {:<width$} cpus={:3} [{:3},{:3}] {}",
            "",
            self.cur_nr_cpus,
            self.min_nr_cpus,
            self.max_nr_cpus,
            &cpumask,
            width = header_width
        )?;

        write!(
            w,
            " {:<width$} [LLC] nr_cpus: sched% lat_ms",
            "",
            width = header_width
        )?;

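        // Emit the per-LLC cells four to a row. Each cell shows how many of
        // the layer's CPUs sit in that LLC, the LLC's share of the layer's
        // scheduling events, and its average latency in milliseconds.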
        for (i, (&frac, &lat)) in self.llc_fracs.iter().zip(self.llc_lats.iter()).enumerate() {
            if (i % 4) == 0 {
                writeln!(w, "")?;
                write!(w, " {:<width$} [{:03}]", "", i, width = header_width)?;
            } else {
                write!(w, " |")?;
            }
            write!(
                w,
                " {:2}:{}%{:7.2}",
                self.nr_llc_cpus[i],
                fmt_pct(frac),
                lat * 1_000.0
            )?;
        }
        writeln!(w, "")?;

        if self.is_excl != 0 {
            writeln!(
                w,
                " {:<width$} excl_coll={} excl_preempt={}",
                "",
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
                width = header_width,
            )?;
        } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
            warn!(
                "{}: exclusive is off but excl_coll={} excl_preempt={}",
                name,
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            );
        }

        Ok(())
    }
}

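/// System-wide statistics reported through scx_stats. Event percentages are
/// relative to the total number of scheduling events across all layers;
/// `layers` holds the per-layer breakdown keyed by layer name.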
#[stat_doc]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
#[stat(top)]
pub struct SysStats {
    #[stat(desc = "timestamp", _om_skip)]
    pub at: f64,
    #[stat(desc = "# of NUMA nodes")]
    pub nr_nodes: usize,
    #[stat(desc = "# sched events during the period")]
    pub total: u64,
    #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
    pub local_sel: f64,
    #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
    pub local_enq: f64,
    #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
    pub open_idle: f64,
    #[stat(desc = "% violated config due to CPU affinity")]
    pub affn_viol: f64,
    #[stat(desc = "% sent to hi fallback DSQs")]
    pub hi_fb: f64,
    #[stat(desc = "% sent to lo fallback DSQs")]
    pub lo_fb: f64,
    #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
    pub excl_collision: f64,
    #[stat(desc = "count of times a sibling CPU was preempted for an excl task")]
    pub excl_preempt: f64,
    #[stat(desc = "count of times a CPU skipped dispatching due to an excl task on the sibling")]
    pub excl_idle: f64,
    #[stat(
        desc = "count of times an idle sibling CPU was woken up after an excl task is finished"
    )]
    pub excl_wakeup: f64,
    #[stat(desc = "CPU time this binary consumed during the period")]
    pub proc_ms: u64,
    #[stat(desc = "CPU busy % (100% means all CPU)")]
    pub busy: f64,
    #[stat(desc = "CPU util % (100% means one CPU)")]
    pub util: f64,
    #[stat(desc = "CPU util % used by hi fallback DSQs")]
    pub hi_fb_util: f64,
    #[stat(desc = "CPU util % used by lo fallback DSQs")]
    pub lo_fb_util: f64,
    #[stat(desc = "Number of tasks dispatched via antistall")]
    pub antistall: u64,
    #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
    pub skip_preempt: u64,
    #[stat(desc = "Number of times vtime was out of range and fixed up")]
    pub fixup_vtime: u64,
    #[stat(desc = "Number of times cpuc->preempting_task didn't come on the CPU")]
    pub preempting_mismatch: u64,
    #[stat(desc = "fallback CPU")]
    pub fallback_cpu: u32,
    #[stat(desc = "fallback CPU util %")]
    pub fallback_cpu_util: f64,
    #[stat(desc = "per-layer statistics")]
    pub layers: BTreeMap<String, LayerStats>,
    #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
    pub gpu_tasks_affinitized: u64,
    #[stat(desc = "Time (in ms) of last affinitization run.")]
    pub gpu_task_affinitization_ms: u64,
}

impl SysStats {
    pub fn new(stats: &Stats, bstats: &BpfStats, fallback_cpu: usize) -> Result<Self> {
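        // System-wide event totals are the per-layer counters summed across
        // layers (lstats_sums); `lsum_pct` converts one of them into a
        // percentage of all scheduling events in the period.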
        let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
        let total = lsum(LSTAT_SEL_LOCAL)
            + lsum(LSTAT_ENQ_LOCAL)
            + lsum(LSTAT_ENQ_WAKEUP)
            + lsum(LSTAT_ENQ_EXPIRE)
            + lsum(LSTAT_ENQ_REENQ)
            + lsum(LSTAT_KEEP);
        let lsum_pct = |idx| {
            if total != 0 {
                lsum(idx) as f64 / total as f64 * 100.0
            } else {
                0.0
            }
        };

        let elapsed_ns = stats.elapsed.as_nanos();

        Ok(Self {
            at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
            nr_nodes: stats.nr_nodes,
            total,
            local_sel: lsum_pct(LSTAT_SEL_LOCAL),
            local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
            open_idle: lsum_pct(LSTAT_OPEN_IDLE),
            affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
            hi_fb: calc_frac(
                stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
                total as f64,
            ),
            lo_fb: calc_frac(
                stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
                total as f64,
            ),
            excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
            excl_idle: bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64,
            excl_wakeup: bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64,
            proc_ms: stats.processing_dur.as_millis() as u64,
            busy: stats.cpu_busy * 100.0,
            util: stats.total_util * 100.0,
            hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
                * 100.0,
            lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
                * 100.0,
            antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
            skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
            fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
            preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
            fallback_cpu: fallback_cpu as u32,
            fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
                / elapsed_ns as f64
                * 100.0,
            layers: BTreeMap::new(),
            gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
            gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
        })
    }

    pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
        writeln!(
            w,
            "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
            self.total,
            fmt_pct(self.local_sel),
            fmt_pct(self.local_enq),
            fmt_pct(self.open_idle),
            fmt_pct(self.affn_viol),
            fmt_pct(self.hi_fb),
            fmt_pct(self.lo_fb),
        )?;

        writeln!(
            w,
            "busy={:5.1} util/hi/lo={:7.1}/{}/{} fallback_cpu/util={:3}/{:4.1} proc={:?}ms",
            self.busy,
            self.util,
            fmt_pct(self.hi_fb_util),
            fmt_pct(self.lo_fb_util),
            self.fallback_cpu,
            self.fallback_cpu_util,
            self.proc_ms,
        )?;

        writeln!(
            w,
            "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
            self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
        )?;

        writeln!(
            w,
            "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
            self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
        )?;

        writeln!(
            w,
            "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
            self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
        )?;

        Ok(())
    }

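    /// Format the system-wide summary followed by every layer's block, with
    /// layers ordered by their index and names padded to a common width.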
    pub fn format_all<W: Write>(&self, w: &mut W) -> Result<()> {
        self.format(w)?;

        let header_width = self
            .layers
            .keys()
            .map(|name| name.len())
            .max()
            .unwrap_or(0)
            .max(4);

        let mut idx_to_name: Vec<(usize, &String)> =
            self.layers.iter().map(|(k, v)| (v.index, k)).collect();

        idx_to_name.sort();

        for (_idx, name) in &idx_to_name {
            self.layers[*name].format(w, name, header_width)?;
        }

        Ok(())
    }
}

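// Request/response messages exchanged between stats server connections and
// the scheduler: each connection sends Hello once, then Refresh to trade the
// Stats state back and forth, and Bye when it goes away.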
#[derive(Debug)]
pub enum StatsReq {
    Hello(ThreadId),
    Refresh(ThreadId, Stats),
    Bye(ThreadId),
}

#[derive(Debug)]
pub enum StatsRes {
    Hello(Stats),
    Refreshed((Stats, SysStats)),
    Bye,
}

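/// Build the scx_stats server-side plumbing: the opener performs the Hello
/// handshake to obtain an initial Stats snapshot, the reader trades it in via
/// Refresh and returns the resulting SysStats as JSON, and the closer sends
/// Bye when the connection is torn down.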
pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        let tid = current().id();
        req_ch.send(StatsReq::Hello(tid))?;
        let mut stats = Some(match res_ch.recv()? {
            StatsRes::Hello(v) => v,
            res => bail!("invalid response to Hello: {:?}", res),
        });

        let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
            Box::new(move |_args, (req_ch, res_ch)| {
                req_ch.send(StatsReq::Refresh(tid, stats.take().unwrap()))?;
                let (new_stats, sys_stats) = match res_ch.recv()? {
                    StatsRes::Refreshed(v) => v,
                    res => bail!("invalid response to Refresh: {:?}", res),
                };
                stats = Some(new_stats);
                sys_stats.to_json()
            });

        Ok(read)
    });

    let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        req_ch.send(StatsReq::Bye(current().id())).unwrap();
        match res_ch.recv().unwrap() {
            StatsRes::Bye => {}
            res => panic!("invalid response to Bye: {:?}", res),
        }
    });

    StatsServerData::new()
        .add_meta(LayerStats::meta())
        .add_meta(SysStats::meta())
        .add_ops(
            "top",
            StatsOps {
                open,
                close: Some(close),
            },
        )
}

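/// Poll SysStats every `intv` and print each snapshot, prefixed with its
/// timestamp rendered in local time, until `shutdown` is set.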
pub fn monitor(intv: Duration, shutdown: Arc<AtomicBool>) -> Result<()> {
    scx_utils::monitor_stats::<SysStats>(
        &vec![],
        intv,
        || shutdown.load(Ordering::Relaxed),
        |sst| {
            let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
            println!("###### {} ######", dt.to_rfc2822());
            sst.format_all(&mut std::io::stdout())
        },
    )
}