Skip to main content

scx_layered/
stats.rs

1use std::collections::BTreeMap;
2use std::io::Write;
3use std::sync::atomic::AtomicBool;
4use std::sync::atomic::Ordering;
5use std::sync::Arc;
6use std::thread::current;
7use std::thread::ThreadId;
8use std::time::Duration;
9use std::time::SystemTime;
10use std::time::UNIX_EPOCH;
11
12use anyhow::bail;
13use anyhow::Result;
14use chrono::DateTime;
15use chrono::Local;
16use scx_stats::prelude::*;
17use scx_stats_derive::stat_doc;
18use scx_stats_derive::Stats;
19use scx_utils::Cpumask;
20use scx_utils::Topology;
21use serde::Deserialize;
22use serde::Serialize;
23use tracing::warn;
24
25use crate::bpf_intf;
26use crate::BpfStats;
27use crate::Layer;
28use crate::LayerKind;
29use crate::Stats;
30use crate::LAYER_USAGE_OPEN;
31use crate::LAYER_USAGE_PROTECTED;
32use crate::LAYER_USAGE_PROTECTED_PREEMPT;
33use crate::LAYER_USAGE_SUM_UPTO;
34
// Indices of the global (system-wide) counters. In this file they are used to
// index the `gstats` array of `BpfStats`. The values come from the
// `global_stat_id` enum exposed through `bpf_intf` (presumably generated
// bindings for the BPF side - confirm against the build script).
const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;

// Indices of the per-layer counters. In this file they are used to index
// `BpfStats::lstats[layer]` (per layer) and `lstats_sums` (summed over layers).
const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;

// Indices of the per-(layer, LLC) counters. In this file they are used to
// index the innermost dimension of `BpfStats::llc_lstats[layer][llc]`.
const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;
83
/// Returns `a` as a percentage of `b` (`a / b * 100`).
///
/// A zero denominator yields `0.0` instead of a non-finite value, so callers
/// can feed in counters from an idle period without special-casing.
fn calc_frac(a: f64, b: f64) -> f64 {
    if b == 0.0 {
        return 0.0;
    }
    a / b * 100.0
}
91
/// Formats a percentage into a compact, 4-column-wide string.
///
/// The number of decimals shrinks as the value grows so that the result stays
/// four characters wide for values in `0..=100`:
///
/// * `>= 99.95` - no decimals (`" 100"`),
/// * `>= 9.995` - one decimal (`"42.5"`),
/// * otherwise - two decimals (`"0.25"`).
///
/// Tiny but non-zero values are clamped up to `0.01` so that they are not
/// displayed as `0.00` and mistaken for "never happened".
fn fmt_pct(v: f64) -> String {
    if v >= 99.95 {
        format!("{:4.0}", v)
    } else if v >= 9.995 {
        // The threshold is 9.995 rather than 10.0: anything at or above it
        // would round to "10.00" with two decimals and overflow the column.
        format!("{:4.1}", v)
    } else if v > 0.0 && v < 0.01 {
        format!("{:4.2}", 0.01)
    } else {
        format!("{:4.2}", v)
    }
}
103
/// Formats a duration given in milliseconds using the largest fitting unit.
///
/// * `>= 60_000` ms - minutes (`"2.5min"`, or no decimals from 100 minutes up),
/// * `>= 1_000` ms - seconds with one decimal (`"1.5s"`),
/// * `>= 10` ms - whole milliseconds (`"250ms"`),
/// * otherwise - milliseconds with one decimal (`"0.5ms"`).
fn fmt_duration_ms(ms: f64) -> String {
    if ms >= 60_000.0 {
        let min = ms / 60_000.0;
        if min >= 100.0 {
            format!("{:.0}min", min)
        } else {
            format!("{:.1}min", min)
        }
    } else if ms >= 1_000.0 {
        // Seconds are always below 60 here, so one decimal is always used;
        // a separate ">= 100 s" case can never be reached.
        format!("{:.1}s", ms / 1_000.0)
    } else if ms >= 10.0 {
        format!("{:.0}ms", ms)
    } else {
        format!("{:.1}ms", ms)
    }
}
125
/// Formats a counter into a 6-column-wide string with an optional unit suffix.
///
/// Values from one million up are shown in millions (`"  2.5m"`), values from
/// one thousand up in thousands (`"  1.5k"`), and smaller values verbatim with
/// a trailing space in place of the suffix so that columns stay aligned.
fn fmt_num(v: u64) -> String {
    // Thresholds are inclusive: with a strict comparison exactly 1_000_000
    // would fall into the "k" branch and print as the 7-character "1000.0k".
    if v >= 1_000_000 {
        format!("{:5.1}m", v as f64 / 1_000_000.0)
    } else if v >= 1_000 {
        format!("{:5.1}k", v as f64 / 1_000.0)
    } else {
        format!("{:5} ", v)
    }
}
135
136#[stat_doc]
137#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
138#[stat(_om_prefix = "l_", _om_label = "layer_name")]
139pub struct LayerStats {
140    #[stat(desc = "index", _om_skip)]
141    pub index: usize,
142    #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
143    pub util: f64,
144    #[stat(desc = "Compensated CPU utilization (adjusted for irq/softirq/stolen)")]
145    pub util_compensated: f64,
146    #[stat(desc = "Protected CPU utilization %")]
147    pub util_protected_frac: f64,
148    #[stat(desc = "Preempt-protected CPU utilization %")]
149    pub util_protected_preempt_frac: f64,
150    #[stat(desc = "Open CPU utilization %")]
151    pub util_open_frac: f64,
152    #[stat(desc = "fraction of total CPU utilization")]
153    pub util_frac: f64,
154    #[stat(desc = "number of tasks")]
155    pub tasks: u32,
156    #[stat(desc = "count of sched events during the period")]
157    pub total: u64,
158    #[stat(desc = "% dispatched into idle CPU from select_cpu")]
159    pub sel_local: f64,
160    #[stat(desc = "% dispatched into idle CPU from enqueue")]
161    pub enq_local: f64,
162    #[stat(desc = "% enqueued after wakeup")]
163    pub enq_wakeup: f64,
164    #[stat(desc = "% enqueued after slice expiration")]
165    pub enq_expire: f64,
166    #[stat(desc = "% re-enqueued due to RT preemption")]
167    pub enq_reenq: f64,
168    #[stat(desc = "% enqueued into the layer's LLC DSQs")]
169    pub enq_dsq: f64,
170    #[stat(desc = "count of times exec duration < min_exec_us")]
171    pub min_exec: f64,
172    #[stat(desc = "total exec durations extended due to min_exec_us")]
173    pub min_exec_us: u64,
174    #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
175    pub open_idle: f64,
176    #[stat(desc = "% preempted other tasks")]
177    pub preempt: f64,
178    #[stat(desc = "% preempted XLLC tasks")]
179    pub preempt_xllc: f64,
180    #[stat(desc = "% preempted across NUMA nodes")]
181    pub preempt_xnuma: f64,
182    #[stat(desc = "% first-preempted other tasks")]
183    pub preempt_first: f64,
184    #[stat(desc = "% idle-preempted other tasks")]
185    pub preempt_idle: f64,
186    #[stat(desc = "% attempted to preempt other tasks but failed")]
187    pub preempt_fail: f64,
188    #[stat(desc = "% violated config due to CPU affinity")]
189    pub affn_viol: f64,
190    #[stat(desc = "% continued executing after slice expiration")]
191    pub keep: f64,
192    #[stat(desc = "% disallowed to continue executing due to max_exec")]
193    pub keep_fail_max_exec: f64,
194    #[stat(desc = "% disallowed to continue executing due to other tasks")]
195    pub keep_fail_busy: f64,
196    #[stat(desc = "whether is exclusive", _om_skip)]
197    pub is_excl: u32,
198    #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
199    pub excl_collision: f64,
200    #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
201    pub excl_preempt: f64,
202    #[stat(desc = "% yielded")]
203    pub yielded: f64,
204    #[stat(desc = "count of times yield was ignored")]
205    pub yield_ignore: u64,
206    #[stat(desc = "% migrated across CPUs")]
207    pub migration: f64,
208    #[stat(desc = "% migrated across NUMA nodes")]
209    pub xnuma_migration: f64,
210    #[stat(desc = "% migrated across LLCs")]
211    pub xllc_migration: f64,
212    #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
213    pub xllc_migration_skip: f64,
214    #[stat(desc = "% wakers across layers")]
215    pub xlayer_wake: f64,
216    #[stat(desc = "% rewakers across layers where waker has waken the task previously")]
217    pub xlayer_rewake: f64,
218    #[stat(desc = "% LLC draining tried")]
219    pub llc_drain_try: f64,
220    #[stat(desc = "% LLC draining succeeded")]
221    pub llc_drain: f64,
222    #[stat(desc = "% skip LLC dispatch on remote node")]
223    pub skip_remote_node: f64,
224    #[stat(desc = "mask of allocated CPUs", _om_skip)]
225    pub cpus: Vec<u64>,
226    #[stat(desc = "count of CPUs assigned")]
227    pub cur_nr_cpus: u32,
228    #[stat(desc = "minimum # of CPUs assigned")]
229    pub min_nr_cpus: u32,
230    #[stat(desc = "maximum # of CPUs assigned")]
231    pub max_nr_cpus: u32,
232    #[stat(desc = "count of CPUs assigned per LLC")]
233    pub nr_llc_cpus: Vec<u32>,
234    #[stat(desc = "slice duration config")]
235    pub slice_us: u64,
236    #[stat(desc = "Per-LLC scheduling event fractions")]
237    pub llc_fracs: Vec<f64>,
238    #[stat(desc = "Per-LLC average latency")]
239    pub llc_lats: Vec<f64>,
240    #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\"")]
241    pub membw_pct: f64,
242    #[stat(desc = "DSQ insertion ratio EWMA (10s window)")]
243    pub dsq_insert_ewma: f64,
244    #[stat(desc = "Per-node layer utilization (100% = one full CPU)")]
245    pub node_utils: Vec<f64>,
246    #[stat(desc = "Per-node pinned task utilization (100% = one full CPU)")]
247    pub node_pinned_utils: Vec<f64>,
248    #[stat(desc = "Per-node pinned task counts")]
249    pub node_pinned_tasks: Vec<u64>,
250    #[stat(desc = "Per-node load (100% = one full CPU, from duty cycle sum)")]
251    pub node_loads: Vec<f64>,
252    #[stat(desc = "Whether xnuma gating is active for this layer (0/1)")]
253    pub xnuma_active: u32,
254}
255
256impl LayerStats {
257    pub fn new(
258        lidx: usize,
259        layer: &Layer,
260        stats: &Stats,
261        bstats: &BpfStats,
262        nr_cpus_range: (usize, usize),
263        xnuma_active: bool,
264    ) -> Self {
265        let lstat = |sidx| bstats.lstats[lidx][sidx];
266        let ltotal = lstat(LSTAT_SEL_LOCAL)
267            + lstat(LSTAT_ENQ_LOCAL)
268            + lstat(LSTAT_ENQ_WAKEUP)
269            + lstat(LSTAT_ENQ_EXPIRE)
270            + lstat(LSTAT_ENQ_REENQ)
271            + lstat(LSTAT_KEEP);
272        let lstat_pct = |sidx| {
273            if ltotal != 0 {
274                lstat(sidx) as f64 / ltotal as f64 * 100.0
275            } else {
276                0.0
277            }
278        };
279
280        let util_sum = stats.layer_utils[lidx]
281            .iter()
282            .take(LAYER_USAGE_SUM_UPTO + 1)
283            .sum::<f64>();
284
285        let util_comp_sum = stats.layer_utils_compensated[lidx]
286            .iter()
287            .take(LAYER_USAGE_SUM_UPTO + 1)
288            .sum::<f64>();
289
290        let membw_frac = match &layer.kind {
291            // Open layer's can't have a memory BW limit.
292            LayerKind::Open { .. } => 0.0,
293            LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
294                // Check if we have set a memory BW limit.
295                if let Some(membw_limit_gb) = membw_gb {
296                    stats.layer_membws[lidx]
297                        .iter()
298                        .take(LAYER_USAGE_SUM_UPTO + 1)
299                        .sum::<f64>()
300                        / (*membw_limit_gb * (1024_u64.pow(3) as f64))
301                } else {
302                    0.0
303                }
304            }
305        };
306
307        Self {
308            index: lidx,
309            util: util_sum * 100.0,
310            util_compensated: util_comp_sum * 100.0,
311            util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
312            util_protected_frac: calc_frac(
313                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
314                util_sum,
315            ),
316            util_protected_preempt_frac: calc_frac(
317                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
318                util_sum,
319            ),
320            util_frac: calc_frac(util_sum, stats.total_util),
321            tasks: stats.nr_layer_tasks[lidx] as u32,
322            total: ltotal,
323            sel_local: lstat_pct(LSTAT_SEL_LOCAL),
324            enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
325            enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
326            enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
327            enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
328            enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
329            min_exec: lstat_pct(LSTAT_MIN_EXEC),
330            min_exec_us: lstat(LSTAT_MIN_EXEC_NS) / 1000,
331            open_idle: lstat_pct(LSTAT_OPEN_IDLE),
332            preempt: lstat_pct(LSTAT_PREEMPT),
333            preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
334            preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
335            preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
336            preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
337            preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
338            affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
339            keep: lstat_pct(LSTAT_KEEP),
340            keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
341            keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
342            is_excl: layer.kind.common().exclusive as u32,
343            excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
344            excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
345            yielded: lstat_pct(LSTAT_YIELD),
346            yield_ignore: lstat(LSTAT_YIELD_IGNORE),
347            migration: lstat_pct(LSTAT_MIGRATION),
348            xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
349            xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
350            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
351            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
352            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
353            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
354            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
355            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
356            cpus: layer.cpus.as_raw_slice().to_vec(),
357            cur_nr_cpus: layer.cpus.weight() as u32,
358            min_nr_cpus: nr_cpus_range.0 as u32,
359            max_nr_cpus: nr_cpus_range.1 as u32,
360            nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
361            slice_us: stats.layer_slice_us[lidx],
362            llc_fracs: {
363                let sid = LLC_LSTAT_CNT;
364                let sum = bstats.llc_lstats[lidx]
365                    .iter()
366                    .map(|lstats| lstats[sid])
367                    .sum::<u64>() as f64;
368                bstats.llc_lstats[lidx]
369                    .iter()
370                    .map(|lstats| calc_frac(lstats[sid] as f64, sum))
371                    .collect()
372            },
373            llc_lats: bstats.llc_lstats[lidx]
374                .iter()
375                .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
376                .collect(),
377            membw_pct: membw_frac * 100.0,
378            dsq_insert_ewma: stats.layer_dsq_insert_ewma[lidx] * 100.0,
379            node_utils: stats.layer_node_utils[lidx]
380                .iter()
381                .map(|u| u * 100.0)
382                .collect(),
383            node_pinned_utils: stats.layer_node_pinned_utils[lidx]
384                .iter()
385                .map(|u| u * 100.0)
386                .collect(),
387            node_pinned_tasks: stats.layer_nr_node_pinned_tasks[lidx].clone(),
388            node_loads: stats.layer_node_duty_sums[lidx]
389                .iter()
390                .map(|l| l * 100.0)
391                .collect(),
392            xnuma_active: if xnuma_active { 1 } else { 0 },
393        }
394    }
395
396    pub fn format<W: Write>(
397        &self,
398        w: &mut W,
399        name: &str,
400        topo: Option<&Topology>,
401        max_width: usize,
402        no_llc: bool,
403    ) -> Result<()> {
404        // Line 1: layer summary
405        let comp_str = if self.util > 0.1 && (self.util_compensated - self.util).abs() > 0.1 {
406            let overhead_pct = (1.0 - self.util / self.util_compensated) * 100.0;
407            format!(" comp_overhead={:.1}%", overhead_pct)
408        } else {
409            String::new()
410        };
411        writeln!(
412            w,
413            "\n\u{25B6} {} \u{2500} util/open/frac={:6.1}/{}/{:7.1}{} prot/prot_preempt={}/{} tasks={:6}",
414            name,
415            self.util,
416            fmt_pct(self.util_open_frac),
417            self.util_frac,
418            comp_str,
419            fmt_pct(self.util_protected_frac),
420            fmt_pct(self.util_protected_preempt_frac),
421            self.tasks,
422        )?;
423
424        // sched: scheduling event flow
425        writeln!(
426            w,
427            "  {:<7} tot={} dd_sel/enq={}/{} dsq/10s={}/{} wake/exp/re={}/{}/{}",
428            "sched",
429            fmt_num(self.total),
430            fmt_pct(self.sel_local),
431            fmt_pct(self.enq_local),
432            fmt_pct(self.enq_dsq),
433            fmt_pct(self.dsq_insert_ewma),
434            fmt_pct(self.enq_wakeup),
435            fmt_pct(self.enq_expire),
436            fmt_pct(self.enq_reenq),
437        )?;
438
439        // exec: execution behavior (merged keep/yield + slice/min_exec)
440        writeln!(
441            w,
442            "  {:<7} keep/max/busy={}/{}/{} yield/ign={}/{} slc={} min_ex={}/{}",
443            "exec",
444            fmt_pct(self.keep),
445            fmt_pct(self.keep_fail_max_exec),
446            fmt_pct(self.keep_fail_busy),
447            fmt_pct(self.yielded),
448            fmt_num(self.yield_ignore),
449            fmt_duration_ms(self.slice_us as f64 / 1000.0),
450            fmt_pct(self.min_exec),
451            fmt_duration_ms(self.min_exec_us as f64 / 1000.0),
452        )?;
453
454        // mig: CPU placement and movement
455        writeln!(
456            w,
457            "  {:<7} mig={} xnuma={} xllc/skip={}/{} open_idle={} affn_viol={}",
458            "mig",
459            fmt_pct(self.migration),
460            fmt_pct(self.xnuma_migration),
461            fmt_pct(self.xllc_migration),
462            fmt_pct(self.xllc_migration_skip),
463            fmt_pct(self.open_idle),
464            fmt_pct(self.affn_viol),
465        )?;
466
467        // preempt: preemption
468        writeln!(
469            w,
470            "  {:<7} preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
471            "preempt",
472            fmt_pct(self.preempt),
473            fmt_pct(self.preempt_first),
474            fmt_pct(self.preempt_xllc),
475            fmt_pct(self.preempt_xnuma),
476            fmt_pct(self.preempt_idle),
477            fmt_pct(self.preempt_fail),
478        )?;
479
480        // xlayer: cross-layer and LLC dispatch
481        writeln!(
482            w,
483            "  {:<7} wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
484            "xlayer",
485            fmt_pct(self.xlayer_wake),
486            fmt_pct(self.xlayer_rewake),
487            fmt_pct(self.llc_drain),
488            fmt_pct(self.llc_drain_try),
489            fmt_pct(self.skip_remote_node),
490        )?;
491
492        // per-node utilization, load, and pinned utilization (multi-node only)
493        if self.node_utils.len() > 1 {
494            let prefix = "  node    pin/ut/ld ";
495            // N99=999.9/999.9/99999.9 = 24 chars + 1 space = 25
496            let cell_width = 25;
497            let usable = if max_width > prefix.len() {
498                max_width - prefix.len()
499            } else {
500                60
501            };
502            let cells_per_row = (usable / cell_width).max(1);
503
504            for nid in 0..self.node_utils.len() {
505                let util = self.node_utils[nid];
506                let load = self.node_loads.get(nid).copied().unwrap_or(0.0);
507                let pin = self.node_pinned_utils.get(nid).copied().unwrap_or(0.0);
508                if nid % cells_per_row == 0 {
509                    if nid > 0 {
510                        writeln!(w)?;
511                    }
512                    write!(w, "{prefix}")?;
513                } else {
514                    write!(w, " ")?;
515                }
516                write!(w, "N{}={:5.1}/{:5.1}/{:7.1}", nid, pin, util, load)?;
517            }
518            writeln!(w)?;
519        }
520
521        // cpumask
522        let cpumask = Cpumask::from_vec(self.cpus.clone());
523
524        if let Some(topo) = topo {
525            let header = topo.format_cpumask_header(&cpumask, self.min_nr_cpus, self.max_nr_cpus);
526            writeln!(w, "  {}", header)?;
527            if cpumask.weight() > 0 {
528                topo.format_cpumask_grid(w, &cpumask, "  ", max_width)?;
529            }
530        } else {
531            writeln!(
532                w,
533                "  cpus={:3} [{:3},{:3}] {}",
534                self.cur_nr_cpus, self.min_nr_cpus, self.max_nr_cpus, &cpumask,
535            )?;
536        }
537
538        // excl stats
539        if self.is_excl != 0 {
540            writeln!(
541                w,
542                "  excl_coll={} excl_preempt={}",
543                fmt_pct(self.excl_collision),
544                fmt_pct(self.excl_preempt),
545            )?;
546        } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
547            warn!(
548                "{}: exclusive is off but excl_coll={} excl_preempt={}",
549                name,
550                fmt_pct(self.excl_collision),
551                fmt_pct(self.excl_preempt),
552            );
553        }
554
555        // LLC stats (compact grid, skip inactive)
556        if !no_llc {
557            // Collect active LLCs (nr_cpus > 0 or frac > 0)
558            let active_llcs: Vec<(usize, f64, f64)> = self
559                .llc_fracs
560                .iter()
561                .zip(self.llc_lats.iter())
562                .enumerate()
563                .filter(|(i, (&frac, _))| {
564                    let nr_cpus = self.nr_llc_cpus.get(*i).copied().unwrap_or(0);
565                    nr_cpus > 0 || frac > 0.0
566                })
567                .map(|(i, (&frac, &lat))| (i, frac, lat))
568                .collect();
569
570            if !active_llcs.is_empty() {
571                let indent = "  ";
572                writeln!(w, "{indent}LLC sched%/lat_ms")?;
573                // Cell format: [XX]99.9/99.9 = 13 chars, + 1 space separator = 14
574                let cell_width = 14;
575                let usable = if max_width > indent.len() {
576                    max_width - indent.len()
577                } else {
578                    60
579                };
580                let cells_per_row = (usable / cell_width).max(1);
581
582                for (col, &(llc_id, frac, lat)) in active_llcs.iter().enumerate() {
583                    if col % cells_per_row == 0 {
584                        if col > 0 {
585                            writeln!(w)?;
586                        }
587                        write!(w, "{indent}")?;
588                    } else {
589                        write!(w, " ")?;
590                    }
591                    write!(w, "[{:02}]{}/{:4.1}", llc_id, fmt_pct(frac), lat * 1_000.0)?;
592                }
593                writeln!(w)?;
594            }
595        }
596
597        Ok(())
598    }
599}
600
601#[stat_doc]
602#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
603#[stat(top)]
604pub struct SysStats {
605    #[stat(desc = "timestamp", _om_skip)]
606    pub at: f64,
607    #[stat(desc = "# of NUMA nodes")]
608    pub nr_nodes: usize,
609    #[stat(desc = "# sched events during the period")]
610    pub total: u64,
611    #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
612    pub local_sel: f64,
613    #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
614    pub local_enq: f64,
615    #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
616    pub open_idle: f64,
617    #[stat(desc = "% violated config due to CPU affinity")]
618    pub affn_viol: f64,
619    #[stat(desc = "% sent to hi fallback DSQs")]
620    pub hi_fb: f64,
621    #[stat(desc = "% sent to lo fallback DSQs")]
622    pub lo_fb: f64,
623    #[stat(desc = "count of times an excl task skipped a CPU as the sibling was also excl")]
624    pub excl_collision: f64,
625    #[stat(desc = "count of times a sibling CPU was preempted for an excl task")]
626    pub excl_preempt: f64,
627    #[stat(desc = "count of times a CPU skipped dispatching due to an excl task on the sibling")]
628    pub excl_idle: f64,
629    #[stat(
630        desc = "count of times an idle sibling CPU was woken up after an excl task is finished"
631    )]
632    pub excl_wakeup: f64,
633    #[stat(desc = "CPU time this binary consumed during the period")]
634    pub proc_ms: u64,
635    #[stat(desc = "CPU busy % (100% means all CPU)")]
636    pub busy: f64,
637    #[stat(desc = "CPU util % (100% means one CPU)")]
638    pub util: f64,
639    #[stat(desc = "CPU util % used by hi fallback DSQs")]
640    pub hi_fb_util: f64,
641    #[stat(desc = "CPU util % used by lo fallback DSQs")]
642    pub lo_fb_util: f64,
643    #[stat(desc = "Number of tasks dispatched via antistall")]
644    pub antistall: u64,
645    #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
646    pub skip_preempt: u64,
647    #[stat(desc = "Number of times vtime was out of range and fixed up")]
648    pub fixup_vtime: u64,
649    #[stat(desc = "Number of times cpuc->preempting_task didn't come on the CPU")]
650    pub preempting_mismatch: u64,
651    #[stat(desc = "per-node fallback CPUs")]
652    pub fallback_cpus: BTreeMap<u32, u32>,
653    #[stat(desc = "per-layer statistics")]
654    pub fallback_cpu_util: f64,
655    #[stat(desc = "fallback CPU util %")]
656    pub layers: BTreeMap<String, LayerStats>,
657    #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
658    pub gpu_tasks_affinitized: u64,
659    #[stat(desc = "Time (in ms) of last affinitization run.")]
660    pub gpu_task_affinitization_ms: u64,
661    #[stat(desc = "System CPU utilization EWMA (10s window)")]
662    pub system_cpu_util_ewma: f64,
663}
664
665impl SysStats {
666    pub fn new(
667        stats: &Stats,
668        bstats: &BpfStats,
669        fallback_cpus: &BTreeMap<usize, usize>,
670    ) -> Result<Self> {
671        let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
672        let total = lsum(LSTAT_SEL_LOCAL)
673            + lsum(LSTAT_ENQ_LOCAL)
674            + lsum(LSTAT_ENQ_WAKEUP)
675            + lsum(LSTAT_ENQ_EXPIRE)
676            + lsum(LSTAT_ENQ_REENQ)
677            + lsum(LSTAT_KEEP);
678        let lsum_pct = |idx| {
679            if total != 0 {
680                lsum(idx) as f64 / total as f64 * 100.0
681            } else {
682                0.0
683            }
684        };
685
686        let elapsed_ns = stats.elapsed.as_nanos();
687
688        Ok(Self {
689            at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
690            nr_nodes: stats.topo.nodes.len(),
691            total,
692            local_sel: lsum_pct(LSTAT_SEL_LOCAL),
693            local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
694            open_idle: lsum_pct(LSTAT_OPEN_IDLE),
695            affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
696            hi_fb: calc_frac(
697                stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
698                total as f64,
699            ),
700            lo_fb: calc_frac(
701                stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
702                total as f64,
703            ),
704            excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
705            excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
706            excl_idle: bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64,
707            excl_wakeup: bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64,
708            proc_ms: stats.processing_dur.as_millis() as u64,
709            busy: stats.cpu_busy * 100.0,
710            util: stats.total_util * 100.0,
711            hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
712                * 100.0,
713            lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
714                * 100.0,
715            antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
716            skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
717            fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
718            preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
719            fallback_cpus: fallback_cpus
720                .iter()
721                .map(|(&k, &v)| (k as u32, v as u32))
722                .collect(),
723            fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
724                / elapsed_ns as f64
725                * 100.0,
726            layers: BTreeMap::new(),
727            gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
728            gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
729            system_cpu_util_ewma: stats.system_cpu_util_ewma * 100.0,
730        })
731    }
732
733    pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
734        writeln!(
735            w,
736            "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
737            self.total,
738            fmt_pct(self.local_sel),
739            fmt_pct(self.local_enq),
740            fmt_pct(self.open_idle),
741            fmt_pct(self.affn_viol),
742            fmt_pct(self.hi_fb),
743            fmt_pct(self.lo_fb),
744        )?;
745
746        let single_node = self.fallback_cpus.len() == 1;
747        let fb_cpus_str: Vec<String> = self
748            .fallback_cpus
749            .iter()
750            .map(|(n, c)| {
751                if single_node {
752                    format!("{}", c)
753                } else {
754                    format!("N{}:{}", n, c)
755                }
756            })
757            .collect();
758        writeln!(
759            w,
760            "busy={:5.1} util/hi/lo={:7.1}/{}/{} fb_cpus=[{}]/util={:4.1} proc={}ms sys_util_10s={:5.1}",
761            self.busy,
762            self.util,
763            fmt_pct(self.hi_fb_util),
764            fmt_pct(self.lo_fb_util),
765            fb_cpus_str.join(","),
766            self.fallback_cpu_util,
767            self.proc_ms,
768            self.system_cpu_util_ewma,
769        )?;
770
771        writeln!(
772            w,
773            "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
774            self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
775        )?;
776
777        writeln!(
778            w,
779            "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
780            self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
781        )?;
782
783        writeln!(
784            w,
785            "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
786            self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
787        )?;
788
789        Ok(())
790    }
791
792    pub fn format_all<W: Write>(
793        &self,
794        w: &mut W,
795        topo: Option<&Topology>,
796        max_width: usize,
797        no_llc: bool,
798    ) -> Result<()> {
799        self.format(w)?;
800
801        let mut idx_to_name: Vec<(usize, &String)> =
802            self.layers.iter().map(|(k, v)| (v.index, k)).collect();
803
804        idx_to_name.sort();
805
806        for (_idx, name) in &idx_to_name {
807            self.layers[*name].format(w, name, topo, max_width, no_llc)?;
808        }
809
810        Ok(())
811    }
812}
813
/// Requests sent over the request channel by the stats-server callbacks that
/// `server_data()` builds.
///
/// Every variant carries the `ThreadId` of the thread issuing the request.
#[derive(Debug)]
pub enum StatsReq {
    /// Sent once by the open callback; answered with `StatsRes::Hello`.
    Hello(ThreadId),
    /// Sent by the read callback on every read. Hands over the `Stats` held
    /// from the previous response; answered with `StatsRes::Refreshed`.
    Refresh(ThreadId, Box<Stats>),
    /// Sent by the close callback; answered with `StatsRes::Bye`.
    Bye(ThreadId),
}
820
/// Responses received over the response channel by the stats-server callbacks
/// that `server_data()` builds, one per `StatsReq` variant.
#[derive(Debug)]
pub enum StatsRes {
    /// Reply to `StatsReq::Hello`; carries the `Stats` that the session hands
    /// back in its first `StatsReq::Refresh`.
    Hello(Box<Stats>),
    /// Reply to `StatsReq::Refresh`; carries the replacement `Stats` to hold
    /// for the next refresh and the `SysStats` returned to the reader as JSON.
    Refreshed(Box<(Stats, SysStats)>),
    /// Reply to `StatsReq::Bye`.
    Bye,
}
827
828pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
829    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
830        let tid = current().id();
831        req_ch.send(StatsReq::Hello(tid))?;
832        let mut stats = Some(match res_ch.recv()? {
833            StatsRes::Hello(v) => *v,
834            res => bail!("invalid response to Hello: {:?}", res),
835        });
836
837        let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
838            Box::new(move |_args, (req_ch, res_ch)| {
839                req_ch.send(StatsReq::Refresh(tid, Box::new(stats.take().unwrap())))?;
840                let (new_stats, sys_stats) = match res_ch.recv()? {
841                    StatsRes::Refreshed(v) => *v,
842                    res => bail!("invalid response to Refresh: {:?}", res),
843                };
844                stats = Some(new_stats);
845                sys_stats.to_json()
846            });
847
848        Ok(read)
849    });
850
851    let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
852        req_ch.send(StatsReq::Bye(current().id())).unwrap();
853        match res_ch.recv().unwrap() {
854            StatsRes::Bye => {}
855            res => panic!("invalid response to Bye: {:?}", res),
856        }
857    });
858
859    StatsServerData::new()
860        .add_meta(LayerStats::meta())
861        .add_meta(SysStats::meta())
862        .add_ops(
863            "top",
864            StatsOps {
865                open,
866                close: Some(close),
867            },
868        )
869}
870
871pub fn monitor(
872    intv: Duration,
873    shutdown: Arc<AtomicBool>,
874    max_width: usize,
875    no_llc: bool,
876) -> Result<()> {
877    let topo = Topology::new().ok();
878    scx_utils::monitor_stats::<SysStats>(
879        &[],
880        intv,
881        || shutdown.load(Ordering::Relaxed),
882        |sst| {
883            let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
884            let header = format!("\u{2501}\u{2501} {} ", dt.to_rfc2822());
885            let pad = max_width.saturating_sub(header.chars().count());
886            println!("{}{}", header, "\u{2501}".repeat(pad));
887            sst.format_all(&mut std::io::stdout(), topo.as_ref(), max_width, no_llc)
888        },
889    )
890}