// scx_layered/stats.rs
1use std::collections::BTreeMap;
2use std::io::Write;
3use std::sync::atomic::AtomicBool;
4use std::sync::atomic::Ordering;
5use std::sync::Arc;
6use std::thread::current;
7use std::thread::ThreadId;
8use std::time::Duration;
9use std::time::SystemTime;
10use std::time::UNIX_EPOCH;
11
12use anyhow::bail;
13use anyhow::Result;
14use chrono::DateTime;
15use chrono::Local;
16use scx_stats::prelude::*;
17use scx_stats_derive::stat_doc;
18use scx_stats_derive::Stats;
19use scx_utils::Cpumask;
20use scx_utils::Topology;
21use serde::Deserialize;
22use serde::Serialize;
23use tracing::warn;
24
25use crate::bpf_intf;
26use crate::BpfStats;
27use crate::Layer;
28use crate::LayerKind;
29use crate::Stats;
30use crate::LAYER_USAGE_OPEN;
31use crate::LAYER_USAGE_PROTECTED;
32use crate::LAYER_USAGE_PROTECTED_PREEMPT;
33use crate::LAYER_USAGE_SUM_UPTO;
34
// Global (system-wide) BPF stat indices, mirrored from the BPF-side
// `global_stat_id` enum via bindgen (`bpf_intf`). Used to index
// `gstats` arrays read back from the BPF maps.
const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;
47
// Per-layer BPF stat indices, mirrored from the BPF-side `layer_stat_id`
// enum. Used to index each layer's `lstats` array.
const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;
80
// Per-layer, per-LLC stat indices (latency accumulator and event count).
const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;
83
/// Express `a` as a percentage of `b`.
///
/// Returns 0.0 when the denominator is zero so callers never see a
/// division-by-zero NaN/inf for empty measurement periods.
fn calc_frac(a: f64, b: f64) -> f64 {
    if b == 0.0 {
        0.0
    } else {
        a / b * 100.0
    }
}
91
/// Render a percentage in exactly four display characters.
///
/// Values that round to 100 drop the decimals; values >= 10 get one
/// decimal; tiny non-zero values are floored to "0.01" so activity is
/// never displayed as zero; everything else gets two decimals.
fn fmt_pct(v: f64) -> String {
    if v >= 99.95 {
        return format!("{:4.0}", v);
    }
    if v >= 10.0 {
        return format!("{:4.1}", v);
    }
    // Clamp tiny-but-nonzero values up so they don't print as 0.00.
    let shown = if v > 0.0 && v < 0.01 { 0.01 } else { v };
    format!("{:4.2}", shown)
}
103
/// Render a duration given in milliseconds with an auto-selected unit
/// (min / s / ms), keeping one decimal until the value reaches three
/// digits in its unit.
fn fmt_duration_ms(ms: f64) -> String {
    // Shared "drop the decimal at >= 100" formatting for min and s.
    let scaled = |val: f64, unit: &str| {
        if val >= 100.0 {
            format!("{:.0}{}", val, unit)
        } else {
            format!("{:.1}{}", val, unit)
        }
    };

    if ms >= 60_000.0 {
        scaled(ms / 60_000.0, "min")
    } else if ms >= 1_000.0 {
        scaled(ms / 1_000.0, "s")
    } else if ms >= 10.0 {
        format!("{:.0}ms", ms)
    } else {
        format!("{:.1}ms", ms)
    }
}
125
/// Render a count in six characters using m/k suffixes above one
/// million / one thousand; small values get a trailing space so all
/// three forms line up in columns.
fn fmt_num(v: u64) -> String {
    if v > 1_000_000 {
        return format!("{:5.1}m", v as f64 / 1_000_000.0);
    }
    if v > 1_000 {
        return format!("{:5.1}k", v as f64 / 1_000.0);
    }
    format!("{:5.0} ", v)
}
135
136#[stat_doc]
137#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
138#[stat(_om_prefix = "l_", _om_label = "layer_name")]
139pub struct LayerStats {
140    #[stat(desc = "index", _om_skip)]
141    pub index: usize,
142    #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
143    pub util: f64,
144    #[stat(desc = "Protected CPU utilization %")]
145    pub util_protected_frac: f64,
146    #[stat(desc = "Preempt-protected CPU utilization %")]
147    pub util_protected_preempt_frac: f64,
148    #[stat(desc = "Open CPU utilization %")]
149    pub util_open_frac: f64,
150    #[stat(desc = "fraction of total CPU utilization")]
151    pub util_frac: f64,
152    #[stat(desc = "number of tasks")]
153    pub tasks: u32,
154    #[stat(desc = "count of sched events during the period")]
155    pub total: u64,
156    #[stat(desc = "% dispatched into idle CPU from select_cpu")]
157    pub sel_local: f64,
158    #[stat(desc = "% dispatched into idle CPU from enqueue")]
159    pub enq_local: f64,
160    #[stat(desc = "% enqueued after wakeup")]
161    pub enq_wakeup: f64,
162    #[stat(desc = "% enqueued after slice expiration")]
163    pub enq_expire: f64,
164    #[stat(desc = "% re-enqueued due to RT preemption")]
165    pub enq_reenq: f64,
166    #[stat(desc = "% enqueued into the layer's LLC DSQs")]
167    pub enq_dsq: f64,
168    #[stat(desc = "count of times exec duration < min_exec_us")]
169    pub min_exec: f64,
170    #[stat(desc = "total exec durations extended due to min_exec_us")]
171    pub min_exec_us: u64,
172    #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
173    pub open_idle: f64,
174    #[stat(desc = "% preempted other tasks")]
175    pub preempt: f64,
176    #[stat(desc = "% preempted XLLC tasks")]
177    pub preempt_xllc: f64,
178    #[stat(desc = "% preempted across NUMA nodes")]
179    pub preempt_xnuma: f64,
180    #[stat(desc = "% first-preempted other tasks")]
181    pub preempt_first: f64,
182    #[stat(desc = "% idle-preempted other tasks")]
183    pub preempt_idle: f64,
184    #[stat(desc = "% attempted to preempt other tasks but failed")]
185    pub preempt_fail: f64,
186    #[stat(desc = "% violated config due to CPU affinity")]
187    pub affn_viol: f64,
188    #[stat(desc = "% continued executing after slice expiration")]
189    pub keep: f64,
190    #[stat(desc = "% disallowed to continue executing due to max_exec")]
191    pub keep_fail_max_exec: f64,
192    #[stat(desc = "% disallowed to continue executing due to other tasks")]
193    pub keep_fail_busy: f64,
194    #[stat(desc = "whether is exclusive", _om_skip)]
195    pub is_excl: u32,
196    #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
197    pub excl_collision: f64,
198    #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
199    pub excl_preempt: f64,
200    #[stat(desc = "% yielded")]
201    pub yielded: f64,
202    #[stat(desc = "count of times yield was ignored")]
203    pub yield_ignore: u64,
204    #[stat(desc = "% migrated across CPUs")]
205    pub migration: f64,
206    #[stat(desc = "% migrated across NUMA nodes")]
207    pub xnuma_migration: f64,
208    #[stat(desc = "% migrated across LLCs")]
209    pub xllc_migration: f64,
210    #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
211    pub xllc_migration_skip: f64,
212    #[stat(desc = "% wakers across layers")]
213    pub xlayer_wake: f64,
214    #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
215    pub xlayer_rewake: f64,
216    #[stat(desc = "% LLC draining tried")]
217    pub llc_drain_try: f64,
218    #[stat(desc = "% LLC draining succeeded")]
219    pub llc_drain: f64,
220    #[stat(desc = "% skip LLC dispatch on remote node")]
221    pub skip_remote_node: f64,
222    #[stat(desc = "mask of allocated CPUs", _om_skip)]
223    pub cpus: Vec<u64>,
224    #[stat(desc = "count of CPUs assigned")]
225    pub cur_nr_cpus: u32,
226    #[stat(desc = "minimum # of CPUs assigned")]
227    pub min_nr_cpus: u32,
228    #[stat(desc = "maximum # of CPUs assigned")]
229    pub max_nr_cpus: u32,
230    #[stat(desc = "count of CPUs assigned per LLC")]
231    pub nr_llc_cpus: Vec<u32>,
232    #[stat(desc = "slice duration config")]
233    pub slice_us: u64,
234    #[stat(desc = "Per-LLC scheduling event fractions")]
235    pub llc_fracs: Vec<f64>,
236    #[stat(desc = "Per-LLC average latency")]
237    pub llc_lats: Vec<f64>,
238    #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\"")]
239    pub membw_pct: f64,
240    #[stat(desc = "DSQ insertion ratio EWMA (10s window)")]
241    pub dsq_insert_ewma: f64,
242    #[stat(desc = "Per-node layer utilization (100% = one full CPU)")]
243    pub node_utils: Vec<f64>,
244    #[stat(desc = "Per-node pinned task utilization (100% = one full CPU)")]
245    pub node_pinned_utils: Vec<f64>,
246    #[stat(desc = "Per-node pinned task counts")]
247    pub node_pinned_tasks: Vec<u64>,
248    #[stat(desc = "Per-node load (100% = one full CPU, from duty cycle sum)")]
249    pub node_loads: Vec<f64>,
250    #[stat(desc = "Whether xnuma gating is active for this layer (0/1)")]
251    pub xnuma_active: u32,
252}
253
impl LayerStats {
    /// Build a `LayerStats` snapshot for layer `lidx` from the latest
    /// userspace aggregates (`stats`) and raw BPF counters (`bstats`).
    ///
    /// `nr_cpus_range` is the (min, max) CPU allocation bounds for the
    /// layer; `xnuma_active` reports whether cross-NUMA gating is on.
    pub fn new(
        lidx: usize,
        layer: &Layer,
        stats: &Stats,
        bstats: &BpfStats,
        nr_cpus_range: (usize, usize),
        xnuma_active: bool,
    ) -> Self {
        // Raw per-layer counter accessor.
        let lstat = |sidx| bstats.lstats[lidx][sidx];
        // Total sched events for this layer: the percentage denominator.
        let ltotal = lstat(LSTAT_SEL_LOCAL)
            + lstat(LSTAT_ENQ_LOCAL)
            + lstat(LSTAT_ENQ_WAKEUP)
            + lstat(LSTAT_ENQ_EXPIRE)
            + lstat(LSTAT_ENQ_REENQ)
            + lstat(LSTAT_KEEP);
        // Counter -> percentage of ltotal, 0 when there were no events.
        let lstat_pct = |sidx| {
            if ltotal != 0 {
                lstat(sidx) as f64 / ltotal as f64 * 100.0
            } else {
                0.0
            }
        };

        // Utilization summed over the usage classes that count toward the
        // layer total (indices 0..=LAYER_USAGE_SUM_UPTO).
        let util_sum = stats.layer_utils[lidx]
            .iter()
            .take(LAYER_USAGE_SUM_UPTO + 1)
            .sum::<f64>();

        // Memory bandwidth used as a fraction of the configured limit.
        let membw_frac = match &layer.kind {
            // Open layer's can't have a memory BW limit.
            LayerKind::Open { .. } => 0.0,
            LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
                // Check if we have set a memory BW limit.
                if let Some(membw_limit_gb) = membw_gb {
                    // Limit is configured in GiB; convert to bytes.
                    stats.layer_membws[lidx]
                        .iter()
                        .take(LAYER_USAGE_SUM_UPTO + 1)
                        .sum::<f64>()
                        / ((*membw_limit_gb * (1024_u64.pow(3) as f64)) as f64)
                } else {
                    0.0
                }
            }
        };

        Self {
            index: lidx,
            util: util_sum * 100.0,
            util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
            util_protected_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
                util_sum,
            ),
            util_protected_preempt_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
                util_sum,
            ),
            util_frac: calc_frac(util_sum, stats.total_util),
            tasks: stats.nr_layer_tasks[lidx] as u32,
            total: ltotal,
            sel_local: lstat_pct(LSTAT_SEL_LOCAL),
            enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
            enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
            enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
            enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
            enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
            min_exec: lstat_pct(LSTAT_MIN_EXEC),
            // Counter is in ns; report in us.
            min_exec_us: (lstat(LSTAT_MIN_EXEC_NS) / 1000) as u64,
            open_idle: lstat_pct(LSTAT_OPEN_IDLE),
            preempt: lstat_pct(LSTAT_PREEMPT),
            preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
            preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
            preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
            preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
            preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
            affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
            keep: lstat_pct(LSTAT_KEEP),
            keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
            keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
            is_excl: layer.kind.common().exclusive as u32,
            excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
            yielded: lstat_pct(LSTAT_YIELD),
            yield_ignore: lstat(LSTAT_YIELD_IGNORE) as u64,
            migration: lstat_pct(LSTAT_MIGRATION),
            xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
            xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
            cpus: layer.cpus.as_raw_slice().to_vec(),
            cur_nr_cpus: layer.cpus.weight() as u32,
            min_nr_cpus: nr_cpus_range.0 as u32,
            max_nr_cpus: nr_cpus_range.1 as u32,
            nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
            slice_us: stats.layer_slice_us[lidx],
            // Each LLC's share of this layer's sched events.
            llc_fracs: {
                let sid = LLC_LSTAT_CNT;
                let sum = bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| lstats[sid])
                    .sum::<u64>() as f64;
                bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| calc_frac(lstats[sid] as f64, sum))
                    .collect()
            },
            // ns -> seconds.
            llc_lats: bstats.llc_lstats[lidx]
                .iter()
                .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
                .collect(),
            membw_pct: membw_frac * 100.0,
            dsq_insert_ewma: stats.layer_dsq_insert_ewma[lidx] * 100.0,
            node_utils: stats.layer_node_utils[lidx]
                .iter()
                .map(|u| u * 100.0)
                .collect(),
            node_pinned_utils: stats.layer_node_pinned_utils[lidx]
                .iter()
                .map(|u| u * 100.0)
                .collect(),
            node_pinned_tasks: stats.layer_nr_node_pinned_tasks[lidx].clone(),
            node_loads: stats.layer_node_duty_sums[lidx]
                .iter()
                .map(|l| l * 100.0)
                .collect(),
            xnuma_active: if xnuma_active { 1 } else { 0 },
        }
    }

    /// Write a multi-line human-readable report for this layer to `w`.
    ///
    /// `topo` enables the topology-aware cpumask grid when available;
    /// `max_width` bounds line length for the grid sections; `no_llc`
    /// suppresses the per-LLC breakdown.
    pub fn format<W: Write>(
        &self,
        w: &mut W,
        name: &str,
        topo: Option<&Topology>,
        max_width: usize,
        no_llc: bool,
    ) -> Result<()> {
        // Line 1: layer summary
        writeln!(
            w,
            "\n\u{25B6} {} \u{2500} util/open/frac={:6.1}/{}/{:7.1} prot/prot_preempt={}/{} tasks={:6}",
            name,
            self.util,
            fmt_pct(self.util_open_frac),
            self.util_frac,
            fmt_pct(self.util_protected_frac),
            fmt_pct(self.util_protected_preempt_frac),
            self.tasks,
        )?;

        // sched: scheduling event flow
        writeln!(
            w,
            "  {:<7} tot={} dd_sel/enq={}/{} dsq/10s={}/{} wake/exp/re={}/{}/{}",
            "sched",
            fmt_num(self.total),
            fmt_pct(self.sel_local),
            fmt_pct(self.enq_local),
            fmt_pct(self.enq_dsq),
            fmt_pct(self.dsq_insert_ewma),
            fmt_pct(self.enq_wakeup),
            fmt_pct(self.enq_expire),
            fmt_pct(self.enq_reenq),
        )?;

        // exec: execution behavior (merged keep/yield + slice/min_exec)
        writeln!(
            w,
            "  {:<7} keep/max/busy={}/{}/{} yield/ign={}/{} slc={} min_ex={}/{}",
            "exec",
            fmt_pct(self.keep),
            fmt_pct(self.keep_fail_max_exec),
            fmt_pct(self.keep_fail_busy),
            fmt_pct(self.yielded),
            fmt_num(self.yield_ignore),
            fmt_duration_ms(self.slice_us as f64 / 1000.0),
            fmt_pct(self.min_exec),
            fmt_duration_ms(self.min_exec_us as f64 / 1000.0),
        )?;

        // mig: CPU placement and movement
        writeln!(
            w,
            "  {:<7} mig={} xnuma={} xllc/skip={}/{} open_idle={} affn_viol={}",
            "mig",
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
            fmt_pct(self.open_idle),
            fmt_pct(self.affn_viol),
        )?;

        // preempt: preemption
        writeln!(
            w,
            "  {:<7} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
            "preempt",
            fmt_pct(self.preempt),
            fmt_pct(self.preempt_first),
            fmt_pct(self.preempt_xllc),
            fmt_pct(self.preempt_xnuma),
            fmt_pct(self.preempt_idle),
            fmt_pct(self.preempt_fail),
        )?;

        // xlayer: cross-layer and LLC dispatch
        writeln!(
            w,
            "  {:<7} wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
            "xlayer",
            fmt_pct(self.xlayer_wake),
            fmt_pct(self.xlayer_rewake),
            fmt_pct(self.llc_drain),
            fmt_pct(self.llc_drain_try),
            fmt_pct(self.skip_remote_node),
        )?;

        // per-node utilization and load (multi-node only)
        if self.node_utils.len() > 1 {
            let xnuma_tag = if self.xnuma_active != 0 {
                " [xnuma]"
            } else {
                ""
            };
            let prefix = format!("  node util/load{xnuma_tag} ");
            // N99=99999.9/99999.9 = 19 chars + 1 space = 20
            let cell_width = 21;
            // Fall back to 60 columns when the prefix alone exceeds
            // max_width.
            let usable = if max_width > prefix.len() {
                max_width - prefix.len()
            } else {
                60
            };
            let cells_per_row = (usable / cell_width).max(1);

            for nid in 0..self.node_utils.len() {
                let util = self.node_utils[nid];
                let load = self.node_loads.get(nid).copied().unwrap_or(0.0);
                // Start a fresh row (with the prefix) every
                // cells_per_row cells.
                if nid % cells_per_row == 0 {
                    if nid > 0 {
                        writeln!(w)?;
                    }
                    write!(w, "{prefix}")?;
                } else {
                    write!(w, " ")?;
                }
                write!(w, "N{}={:7.1}/{:7.1}", nid, util, load)?;
            }
            writeln!(w)?;
        }

        // node-pinned utilization and task counts (util/tasks per node)
        if self.node_pinned_tasks.iter().any(|t| *t > 0) {
            let prefix = "  pinned  util/tasks ";
            // N99=99999.9/99999 = 18 chars + 1 space = 19
            let cell_width = 19;
            let usable = if max_width > prefix.len() {
                max_width - prefix.len()
            } else {
                60
            };
            let cells_per_row = (usable / cell_width).max(1);

            for nid in 0..self.node_pinned_utils.len() {
                let util = self.node_pinned_utils[nid];
                let tasks = self.node_pinned_tasks.get(nid).copied().unwrap_or(0);
                if nid % cells_per_row == 0 {
                    if nid > 0 {
                        writeln!(w)?;
                    }
                    write!(w, "{prefix}")?;
                } else {
                    write!(w, " ")?;
                }
                write!(w, "N{}={:7.1}/{:5}", nid, util, tasks)?;
            }
            writeln!(w)?;
        }

        // cpumask
        let cpumask = Cpumask::from_vec(self.cpus.clone());

        if let Some(topo) = topo {
            let header = topo.format_cpumask_header(&cpumask, self.min_nr_cpus, self.max_nr_cpus);
            writeln!(w, "  {}", header)?;
            if cpumask.weight() > 0 {
                topo.format_cpumask_grid(w, &cpumask, "  ", max_width)?;
            }
        } else {
            // No topology available: plain count/range/mask line.
            writeln!(
                w,
                "  cpus={:3} [{:3},{:3}] {}",
                self.cur_nr_cpus, self.min_nr_cpus, self.max_nr_cpus, &cpumask,
            )?;
        }

        // excl stats
        if self.is_excl != 0 {
            writeln!(
                w,
                "  excl_coll={} excl_preempt={}",
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            )?;
        } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
            // Exclusive counters ticking on a non-exclusive layer
            // indicates a BPF-side inconsistency worth surfacing.
            warn!(
                "{}: exclusive is off but excl_coll={} excl_preempt={}",
                name,
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            );
        }

        // LLC stats (compact grid, skip inactive)
        if !no_llc {
            // Collect active LLCs (nr_cpus > 0 or frac > 0)
            let active_llcs: Vec<(usize, f64, f64)> = self
                .llc_fracs
                .iter()
                .zip(self.llc_lats.iter())
                .enumerate()
                .filter(|(i, (&frac, _))| {
                    let nr_cpus = self.nr_llc_cpus.get(*i).copied().unwrap_or(0);
                    nr_cpus > 0 || frac > 0.0
                })
                .map(|(i, (&frac, &lat))| (i, frac, lat))
                .collect();

            if !active_llcs.is_empty() {
                let indent = "  ";
                writeln!(w, "{indent}LLC sched%/lat_ms")?;
                // Cell format: [XX]99.9/99.9 = 13 chars, + 1 space separator = 14
                let cell_width = 14;
                let usable = if max_width > indent.len() {
                    max_width - indent.len()
                } else {
                    60
                };
                let cells_per_row = (usable / cell_width).max(1);

                for (col, &(llc_id, frac, lat)) in active_llcs.iter().enumerate() {
                    if col % cells_per_row == 0 {
                        if col > 0 {
                            writeln!(w)?;
                        }
                        write!(w, "{indent}")?;
                    } else {
                        write!(w, " ")?;
                    }
                    // Latency is stored in seconds; print ms.
                    write!(w, "[{:02}]{}/{:4.1}", llc_id, fmt_pct(frac), lat * 1_000.0)?;
                }
                writeln!(w)?;
            }
        }

        Ok(())
    }
}
617
618#[stat_doc]
619#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
620#[stat(top)]
621pub struct SysStats {
622    #[stat(desc = "timestamp", _om_skip)]
623    pub at: f64,
624    #[stat(desc = "# of NUMA nodes")]
625    pub nr_nodes: usize,
626    #[stat(desc = "# sched events during the period")]
627    pub total: u64,
628    #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
629    pub local_sel: f64,
630    #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
631    pub local_enq: f64,
632    #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
633    pub open_idle: f64,
634    #[stat(desc = "% violated config due to CPU affinity")]
635    pub affn_viol: f64,
636    #[stat(desc = "% sent to hi fallback DSQs")]
637    pub hi_fb: f64,
638    #[stat(desc = "% sent to lo fallback DSQs")]
639    pub lo_fb: f64,
640    #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
641    pub excl_collision: f64,
642    #[stat(desc = "count of times a sibling CPU was preempted for an excl task")]
643    pub excl_preempt: f64,
644    #[stat(desc = "count of times a CPU skipped dispatching due to an excl task on the sibling")]
645    pub excl_idle: f64,
646    #[stat(
647        desc = "count of times an idle sibling CPU was woken up after an excl task is finished"
648    )]
649    pub excl_wakeup: f64,
650    #[stat(desc = "CPU time this binary consumed during the period")]
651    pub proc_ms: u64,
652    #[stat(desc = "CPU busy % (100% means all CPU)")]
653    pub busy: f64,
654    #[stat(desc = "CPU util % (100% means one CPU)")]
655    pub util: f64,
656    #[stat(desc = "CPU util % used by hi fallback DSQs")]
657    pub hi_fb_util: f64,
658    #[stat(desc = "CPU util % used by lo fallback DSQs")]
659    pub lo_fb_util: f64,
660    #[stat(desc = "Number of tasks dispatched via antistall")]
661    pub antistall: u64,
662    #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
663    pub skip_preempt: u64,
664    #[stat(desc = "Number of times vtime was out of range and fixed up")]
665    pub fixup_vtime: u64,
666    #[stat(desc = "Number of times cpuc->preempting_task didn't come on the CPU")]
667    pub preempting_mismatch: u64,
668    #[stat(desc = "per-node fallback CPUs")]
669    pub fallback_cpus: BTreeMap<u32, u32>,
670    #[stat(desc = "per-layer statistics")]
671    pub fallback_cpu_util: f64,
672    #[stat(desc = "fallback CPU util %")]
673    pub layers: BTreeMap<String, LayerStats>,
674    #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
675    pub gpu_tasks_affinitized: u64,
676    #[stat(desc = "Time (in ms) of last affinitization run.")]
677    pub gpu_task_affinitization_ms: u64,
678    #[stat(desc = "System CPU utilization EWMA (10s window)")]
679    pub system_cpu_util_ewma: f64,
680}
681
682impl SysStats {
683    pub fn new(
684        stats: &Stats,
685        bstats: &BpfStats,
686        fallback_cpus: &BTreeMap<usize, usize>,
687    ) -> Result<Self> {
688        let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
689        let total = lsum(LSTAT_SEL_LOCAL)
690            + lsum(LSTAT_ENQ_LOCAL)
691            + lsum(LSTAT_ENQ_WAKEUP)
692            + lsum(LSTAT_ENQ_EXPIRE)
693            + lsum(LSTAT_ENQ_REENQ)
694            + lsum(LSTAT_KEEP);
695        let lsum_pct = |idx| {
696            if total != 0 {
697                lsum(idx) as f64 / total as f64 * 100.0
698            } else {
699                0.0
700            }
701        };
702
703        let elapsed_ns = stats.elapsed.as_nanos();
704
705        Ok(Self {
706            at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
707            nr_nodes: stats.topo.nodes.len(),
708            total,
709            local_sel: lsum_pct(LSTAT_SEL_LOCAL),
710            local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
711            open_idle: lsum_pct(LSTAT_OPEN_IDLE),
712            affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
713            hi_fb: calc_frac(
714                stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
715                total as f64,
716            ),
717            lo_fb: calc_frac(
718                stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
719                total as f64,
720            ),
721            excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
722            excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
723            excl_idle: bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64,
724            excl_wakeup: bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64,
725            proc_ms: stats.processing_dur.as_millis() as u64,
726            busy: stats.cpu_busy * 100.0,
727            util: stats.total_util * 100.0,
728            hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
729                * 100.0,
730            lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
731                * 100.0,
732            antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
733            skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
734            fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
735            preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
736            fallback_cpus: fallback_cpus
737                .iter()
738                .map(|(&k, &v)| (k as u32, v as u32))
739                .collect(),
740            fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
741                / elapsed_ns as f64
742                * 100.0,
743            layers: BTreeMap::new(),
744            gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
745            gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
746            system_cpu_util_ewma: stats.system_cpu_util_ewma * 100.0,
747        })
748    }
749
750    pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
751        writeln!(
752            w,
753            "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
754            self.total,
755            fmt_pct(self.local_sel),
756            fmt_pct(self.local_enq),
757            fmt_pct(self.open_idle),
758            fmt_pct(self.affn_viol),
759            fmt_pct(self.hi_fb),
760            fmt_pct(self.lo_fb),
761        )?;
762
763        let single_node = self.fallback_cpus.len() == 1;
764        let fb_cpus_str: Vec<String> = self
765            .fallback_cpus
766            .iter()
767            .map(|(n, c)| {
768                if single_node {
769                    format!("{}", c)
770                } else {
771                    format!("N{}:{}", n, c)
772                }
773            })
774            .collect();
775        writeln!(
776            w,
777            "busy={:5.1} util/hi/lo={:7.1}/{}/{} fb_cpus=[{}]/util={:4.1} proc={}ms sys_util_10s={:5.1}",
778            self.busy,
779            self.util,
780            fmt_pct(self.hi_fb_util),
781            fmt_pct(self.lo_fb_util),
782            fb_cpus_str.join(","),
783            self.fallback_cpu_util,
784            self.proc_ms,
785            self.system_cpu_util_ewma,
786        )?;
787
788        writeln!(
789            w,
790            "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
791            self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
792        )?;
793
794        writeln!(
795            w,
796            "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
797            self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
798        )?;
799
800        writeln!(
801            w,
802            "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
803            self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
804        )?;
805
806        Ok(())
807    }
808
809    pub fn format_all<W: Write>(
810        &self,
811        w: &mut W,
812        topo: Option<&Topology>,
813        max_width: usize,
814        no_llc: bool,
815    ) -> Result<()> {
816        self.format(w)?;
817
818        let mut idx_to_name: Vec<(usize, &String)> =
819            self.layers.iter().map(|(k, v)| (v.index, k)).collect();
820
821        idx_to_name.sort();
822
823        for (_idx, name) in &idx_to_name {
824            self.layers[*name].format(w, name, topo, max_width, no_llc)?;
825        }
826
827        Ok(())
828    }
829}
830
/// Requests sent from a stats client connection to the scheduler main loop.
#[derive(Debug)]
pub enum StatsReq {
    /// A new client, identified by its thread id, is connecting.
    Hello(ThreadId),
    /// Ask for refreshed stats, handing back the `Stats` snapshot this
    /// client previously received from the server.
    Refresh(ThreadId, Stats),
    /// The client identified by this thread id is disconnecting.
    Bye(ThreadId),
}
837
/// Responses from the scheduler main loop back to a stats client connection.
#[derive(Debug)]
pub enum StatsRes {
    /// Initial `Stats` snapshot handed to a newly connected client.
    Hello(Stats),
    /// Updated `Stats` snapshot plus the formatted system-wide stats.
    Refreshed((Stats, SysStats)),
    /// Acknowledgement of a client's `Bye`.
    Bye,
}
844
/// Build the `StatsServerData` describing the "top" stats channel.
///
/// The open handler runs a Hello handshake to obtain the initial `Stats`
/// snapshot, then returns a reader that, on each poll, trades the held
/// snapshot for a refreshed one and serializes the accompanying `SysStats`
/// to JSON. The close handler runs the Bye handshake.
pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        // Identify this connection by its thread id and fetch the initial
        // Stats snapshot from the scheduler side.
        let tid = current().id();
        req_ch.send(StatsReq::Hello(tid))?;
        let mut stats = Some(match res_ch.recv()? {
            StatsRes::Hello(v) => v,
            res => bail!("invalid response to Hello: {:?}", res),
        });

        // Each read sends the previously held snapshot back with the Refresh
        // request and stores the returned one for the next round. `stats` is
        // always Some between calls, hence the unwrap.
        let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
            Box::new(move |_args, (req_ch, res_ch)| {
                req_ch.send(StatsReq::Refresh(tid, stats.take().unwrap()))?;
                let (new_stats, sys_stats) = match res_ch.recv()? {
                    StatsRes::Refreshed(v) => v,
                    res => bail!("invalid response to Refresh: {:?}", res),
                };
                stats = Some(new_stats);
                sys_stats.to_json()
            });

        Ok(read)
    });

    let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        // A failed Bye handshake indicates a protocol bug rather than a
        // recoverable condition, so unwrap/panic instead of propagating.
        req_ch.send(StatsReq::Bye(current().id())).unwrap();
        match res_ch.recv().unwrap() {
            StatsRes::Bye => {}
            res => panic!("invalid response to Bye: {:?}", res),
        }
    });

    StatsServerData::new()
        .add_meta(LayerStats::meta())
        .add_meta(SysStats::meta())
        .add_ops(
            "top",
            StatsOps {
                open,
                close: Some(close),
            },
        )
}
887
888pub fn monitor(
889    intv: Duration,
890    shutdown: Arc<AtomicBool>,
891    max_width: usize,
892    no_llc: bool,
893) -> Result<()> {
894    let topo = Topology::new().ok();
895    scx_utils::monitor_stats::<SysStats>(
896        &[],
897        intv,
898        || shutdown.load(Ordering::Relaxed),
899        |sst| {
900            let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
901            let header = format!("\u{2501}\u{2501} {} ", dt.to_rfc2822());
902            let pad = max_width.saturating_sub(header.chars().count());
903            println!("{}{}", header, "\u{2501}".repeat(pad));
904            sst.format_all(&mut std::io::stdout(), topo.as_ref(), max_width, no_llc)
905        },
906    )
907}