scx_layered/stats.rs

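//! Statistics support for scx_layered: per-layer (`LayerStats`) and
//! system-wide (`SysStats`) stat structs exported through the scx_stats
//! framework, their human-readable formatting, and the stats-server
//! plumbing that trades `Stats` snapshots with the scheduler thread.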
use std::collections::BTreeMap;
use std::io::Write;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread::current;
use std::thread::ThreadId;
use std::time::Duration;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;

use anyhow::bail;
use anyhow::Result;
use chrono::DateTime;
use chrono::Local;
use scx_stats::prelude::*;
use scx_stats_derive::stat_doc;
use scx_stats_derive::Stats;
use scx_utils::Cpumask;
use serde::Deserialize;
use serde::Serialize;
use tracing::warn;

use crate::bpf_intf;
use crate::BpfStats;
use crate::Layer;
use crate::LayerKind;
use crate::Stats;
use crate::LAYER_USAGE_OPEN;
use crate::LAYER_USAGE_PROTECTED;
use crate::LAYER_USAGE_PROTECTED_PREEMPT;
use crate::LAYER_USAGE_SUM_UPTO;

const GSTAT_EXCL_IDLE: usize = bpf_intf::global_stat_id_GSTAT_EXCL_IDLE as usize;
const GSTAT_EXCL_WAKEUP: usize = bpf_intf::global_stat_id_GSTAT_EXCL_WAKEUP as usize;
const GSTAT_HI_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_EVENTS as usize;
const GSTAT_HI_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_HI_FB_USAGE as usize;
const GSTAT_LO_FB_EVENTS: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_EVENTS as usize;
const GSTAT_LO_FB_USAGE: usize = bpf_intf::global_stat_id_GSTAT_LO_FB_USAGE as usize;
const GSTAT_FB_CPU_USAGE: usize = bpf_intf::global_stat_id_GSTAT_FB_CPU_USAGE as usize;
const GSTAT_ANTISTALL: usize = bpf_intf::global_stat_id_GSTAT_ANTISTALL as usize;
const GSTAT_SKIP_PREEMPT: usize = bpf_intf::global_stat_id_GSTAT_SKIP_PREEMPT as usize;
const GSTAT_FIXUP_VTIME: usize = bpf_intf::global_stat_id_GSTAT_FIXUP_VTIME as usize;
const GSTAT_PREEMPTING_MISMATCH: usize =
    bpf_intf::global_stat_id_GSTAT_PREEMPTING_MISMATCH as usize;

const LSTAT_SEL_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_SEL_LOCAL as usize;
const LSTAT_ENQ_LOCAL: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_LOCAL as usize;
const LSTAT_ENQ_WAKEUP: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_WAKEUP as usize;
const LSTAT_ENQ_EXPIRE: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_EXPIRE as usize;
const LSTAT_ENQ_REENQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_REENQ as usize;
const LSTAT_ENQ_DSQ: usize = bpf_intf::layer_stat_id_LSTAT_ENQ_DSQ as usize;
const LSTAT_MIN_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC as usize;
const LSTAT_MIN_EXEC_NS: usize = bpf_intf::layer_stat_id_LSTAT_MIN_EXEC_NS as usize;
const LSTAT_OPEN_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_OPEN_IDLE as usize;
const LSTAT_AFFN_VIOL: usize = bpf_intf::layer_stat_id_LSTAT_AFFN_VIOL as usize;
const LSTAT_KEEP: usize = bpf_intf::layer_stat_id_LSTAT_KEEP as usize;
const LSTAT_KEEP_FAIL_MAX_EXEC: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_MAX_EXEC as usize;
const LSTAT_KEEP_FAIL_BUSY: usize = bpf_intf::layer_stat_id_LSTAT_KEEP_FAIL_BUSY as usize;
const LSTAT_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT as usize;
const LSTAT_PREEMPT_FIRST: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FIRST as usize;
const LSTAT_PREEMPT_XLLC: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XLLC as usize;
const LSTAT_PREEMPT_XNUMA: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_XNUMA as usize;
const LSTAT_PREEMPT_IDLE: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_IDLE as usize;
const LSTAT_PREEMPT_FAIL: usize = bpf_intf::layer_stat_id_LSTAT_PREEMPT_FAIL as usize;
const LSTAT_EXCL_COLLISION: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_COLLISION as usize;
const LSTAT_EXCL_PREEMPT: usize = bpf_intf::layer_stat_id_LSTAT_EXCL_PREEMPT as usize;
const LSTAT_YIELD: usize = bpf_intf::layer_stat_id_LSTAT_YIELD as usize;
const LSTAT_YIELD_IGNORE: usize = bpf_intf::layer_stat_id_LSTAT_YIELD_IGNORE as usize;
const LSTAT_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_MIGRATION as usize;
const LSTAT_XNUMA_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XNUMA_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION as usize;
const LSTAT_XLLC_MIGRATION_SKIP: usize = bpf_intf::layer_stat_id_LSTAT_XLLC_MIGRATION_SKIP as usize;
const LSTAT_XLAYER_WAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_WAKE as usize;
const LSTAT_XLAYER_REWAKE: usize = bpf_intf::layer_stat_id_LSTAT_XLAYER_REWAKE as usize;
const LSTAT_LLC_DRAIN_TRY: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN_TRY as usize;
const LSTAT_LLC_DRAIN: usize = bpf_intf::layer_stat_id_LSTAT_LLC_DRAIN as usize;
const LSTAT_SKIP_REMOTE_NODE: usize = bpf_intf::layer_stat_id_LSTAT_SKIP_REMOTE_NODE as usize;

const LLC_LSTAT_LAT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_LAT as usize;
const LLC_LSTAT_CNT: usize = bpf_intf::llc_layer_stat_id_LLC_LSTAT_CNT as usize;

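/// Returns `a / b * 100.0`, or 0.0 when the denominator is zero
/// (e.g. `calc_frac(1.0, 4.0)` is 25.0).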
fn calc_frac(a: f64, b: f64) -> f64 {
    if b != 0.0 {
        a / b * 100.0
    } else {
        0.0
    }
}

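/// Formats a percentage into a fixed five-character column: values that
/// would round to 100.00 drop to one decimal so the width stays constant,
/// and non-zero values below 0.01 are clamped up to 0.01 so they remain
/// visible (e.g. `fmt_pct(0.001)` yields " 0.01").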
fn fmt_pct(v: f64) -> String {
    if v >= 99.995 {
        format!("{:5.1}", v)
    } else if v > 0.0 && v < 0.01 {
        format!("{:5.2}", 0.01)
    } else {
        format!("{:5.2}", v)
    }
}

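/// Formats a count into a fixed-width column, scaling to "k" and "m"
/// suffixes above a thousand and a million respectively
/// (e.g. `fmt_num(1_234_567)` yields "  1.2m").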
fn fmt_num(v: u64) -> String {
    if v > 1_000_000 {
        format!("{:5.1}m", v as f64 / 1_000_000.0)
    } else if v > 1_000 {
        format!("{:5.1}k", v as f64 / 1_000.0)
    } else {
        format!("{:5.0} ", v)
    }
}

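/// Per-layer statistics exported through the scx_stats framework. Most
/// `%` fields are percentages of the layer's total scheduling events for
/// the reporting period.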
#[stat_doc]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
#[stat(_om_prefix = "l_", _om_label = "layer_name")]
pub struct LayerStats {
    #[stat(desc = "index", _om_skip)]
    pub index: usize,
    #[stat(desc = "Total CPU utilization (100% means one full CPU)")]
    pub util: f64,
    #[stat(desc = "Protected CPU utilization %")]
    pub util_protected_frac: f64,
    #[stat(desc = "Preempt-protected CPU utilization %")]
    pub util_protected_preempt_frac: f64,
    #[stat(desc = "Open CPU utilization %")]
    pub util_open_frac: f64,
    #[stat(desc = "% of total CPU utilization")]
    pub util_frac: f64,
    #[stat(desc = "number of tasks")]
    pub tasks: u32,
    #[stat(desc = "count of sched events during the period")]
    pub total: u64,
    #[stat(desc = "% dispatched into idle CPU from select_cpu")]
    pub sel_local: f64,
    #[stat(desc = "% dispatched into idle CPU from enqueue")]
    pub enq_local: f64,
    #[stat(desc = "% enqueued after wakeup")]
    pub enq_wakeup: f64,
    #[stat(desc = "% enqueued after slice expiration")]
    pub enq_expire: f64,
    #[stat(desc = "% re-enqueued due to RT preemption")]
    pub enq_reenq: f64,
    #[stat(desc = "% enqueued into the layer's LLC DSQs")]
    pub enq_dsq: f64,
    #[stat(desc = "% of times exec duration < min_exec_us")]
    pub min_exec: f64,
    #[stat(desc = "total exec durations extended due to min_exec_us")]
    pub min_exec_us: u64,
    #[stat(desc = "% dispatched into idle CPUs occupied by other layers")]
    pub open_idle: f64,
    #[stat(desc = "% preempted other tasks")]
    pub preempt: f64,
    #[stat(desc = "% preempted XLLC tasks")]
    pub preempt_xllc: f64,
    #[stat(desc = "% preempted XNUMA tasks")]
    pub preempt_xnuma: f64,
    #[stat(desc = "% first-preempted other tasks")]
    pub preempt_first: f64,
    #[stat(desc = "% idle-preempted other tasks")]
    pub preempt_idle: f64,
    #[stat(desc = "% attempted to preempt other tasks but failed")]
    pub preempt_fail: f64,
    #[stat(desc = "% violated config due to CPU affinity")]
    pub affn_viol: f64,
    #[stat(desc = "% continued executing after slice expiration")]
    pub keep: f64,
    #[stat(desc = "% disallowed to continue executing due to max_exec")]
    pub keep_fail_max_exec: f64,
    #[stat(desc = "% disallowed to continue executing due to other tasks")]
    pub keep_fail_busy: f64,
    #[stat(desc = "whether the layer is exclusive", _om_skip)]
    pub is_excl: u32,
    #[stat(desc = "% of times an excl task skipped a CPU as the sibling was also excl")]
    pub excl_collision: f64,
    #[stat(desc = "% a sibling CPU was preempted for an exclusive task")]
    pub excl_preempt: f64,
    #[stat(desc = "% yielded")]
    pub yielded: f64,
    #[stat(desc = "count of times yield was ignored")]
    pub yield_ignore: u64,
    #[stat(desc = "% migrated across CPUs")]
    pub migration: f64,
    #[stat(desc = "% migrated across NUMA nodes")]
    pub xnuma_migration: f64,
    #[stat(desc = "% migrated across LLCs")]
    pub xllc_migration: f64,
    #[stat(desc = "% migration skipped across LLCs due to xllc_mig_min_us")]
    pub xllc_migration_skip: f64,
    #[stat(desc = "% wakers across layers")]
    pub xlayer_wake: f64,
    #[stat(desc = "% rewakers across layers where the waker has woken the task previously")]
    pub xlayer_rewake: f64,
    #[stat(desc = "% LLC draining tried")]
    pub llc_drain_try: f64,
    #[stat(desc = "% LLC draining succeeded")]
    pub llc_drain: f64,
    #[stat(desc = "% skip LLC dispatch on remote node")]
    pub skip_remote_node: f64,
    #[stat(desc = "mask of allocated CPUs", _om_skip)]
    pub cpus: Vec<u64>,
    #[stat(desc = "current # of CPUs assigned")]
    pub cur_nr_cpus: u32,
    #[stat(desc = "minimum # of CPUs assigned")]
    pub min_nr_cpus: u32,
    #[stat(desc = "maximum # of CPUs assigned")]
    pub max_nr_cpus: u32,
    #[stat(desc = "count of CPUs assigned per LLC")]
    pub nr_llc_cpus: Vec<u32>,
    #[stat(desc = "slice duration config")]
    pub slice_us: u64,
    #[stat(desc = "Per-LLC scheduling event fractions")]
    pub llc_fracs: Vec<f64>,
    #[stat(desc = "Per-LLC average latency")]
    pub llc_lats: Vec<f64>,
    #[stat(desc = "Layer memory bandwidth as a % of total allowed (0 for \"no limit\")")]
    pub membw_pct: f64,
    #[stat(desc = "DSQ insertion ratio EWMA (10s window)")]
    pub dsq_insert_ewma: f64,
}

impl LayerStats {
    pub fn new(
        lidx: usize,
        layer: &Layer,
        stats: &Stats,
        bstats: &BpfStats,
        nr_cpus_range: (usize, usize),
    ) -> Self {
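        // `lstat` reads one BPF-side counter for this layer. `ltotal` sums
        // the counters that make up all scheduling events and serves as
        // the denominator for `lstat_pct` below.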
        let lstat = |sidx| bstats.lstats[lidx][sidx];
        let ltotal = lstat(LSTAT_SEL_LOCAL)
            + lstat(LSTAT_ENQ_LOCAL)
            + lstat(LSTAT_ENQ_WAKEUP)
            + lstat(LSTAT_ENQ_EXPIRE)
            + lstat(LSTAT_ENQ_REENQ)
            + lstat(LSTAT_KEEP);
        let lstat_pct = |sidx| {
            if ltotal != 0 {
                lstat(sidx) as f64 / ltotal as f64 * 100.0
            } else {
                0.0
            }
        };

        let util_sum = stats.layer_utils[lidx]
            .iter()
            .take(LAYER_USAGE_SUM_UPTO + 1)
            .sum::<f64>();

        let membw_frac = match &layer.kind {
            // Open layers can't have a memory BW limit.
            LayerKind::Open { .. } => 0.0,
            LayerKind::Confined { membw_gb, .. } | LayerKind::Grouped { membw_gb, .. } => {
                // Compute usage relative to the limit only when one is set.
                if let Some(membw_limit_gb) = membw_gb {
                    stats.layer_membws[lidx]
                        .iter()
                        .take(LAYER_USAGE_SUM_UPTO + 1)
                        .sum::<f64>()
                        / (*membw_limit_gb * 1024_u64.pow(3) as f64)
                } else {
                    0.0
                }
            }
        };

        Self {
            index: lidx,
            util: util_sum * 100.0,
            util_open_frac: calc_frac(stats.layer_utils[lidx][LAYER_USAGE_OPEN], util_sum),
            util_protected_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED],
                util_sum,
            ),
            util_protected_preempt_frac: calc_frac(
                stats.layer_utils[lidx][LAYER_USAGE_PROTECTED_PREEMPT],
                util_sum,
            ),
            util_frac: calc_frac(util_sum, stats.total_util),
            tasks: stats.nr_layer_tasks[lidx] as u32,
            total: ltotal,
            sel_local: lstat_pct(LSTAT_SEL_LOCAL),
            enq_local: lstat_pct(LSTAT_ENQ_LOCAL),
            enq_wakeup: lstat_pct(LSTAT_ENQ_WAKEUP),
            enq_expire: lstat_pct(LSTAT_ENQ_EXPIRE),
            enq_reenq: lstat_pct(LSTAT_ENQ_REENQ),
            enq_dsq: lstat_pct(LSTAT_ENQ_DSQ),
            min_exec: lstat_pct(LSTAT_MIN_EXEC),
            min_exec_us: (lstat(LSTAT_MIN_EXEC_NS) / 1000) as u64,
            open_idle: lstat_pct(LSTAT_OPEN_IDLE),
            preempt: lstat_pct(LSTAT_PREEMPT),
            preempt_xllc: lstat_pct(LSTAT_PREEMPT_XLLC),
            preempt_xnuma: lstat_pct(LSTAT_PREEMPT_XNUMA),
            preempt_first: lstat_pct(LSTAT_PREEMPT_FIRST),
            preempt_idle: lstat_pct(LSTAT_PREEMPT_IDLE),
            preempt_fail: lstat_pct(LSTAT_PREEMPT_FAIL),
            affn_viol: lstat_pct(LSTAT_AFFN_VIOL),
            keep: lstat_pct(LSTAT_KEEP),
            keep_fail_max_exec: lstat_pct(LSTAT_KEEP_FAIL_MAX_EXEC),
            keep_fail_busy: lstat_pct(LSTAT_KEEP_FAIL_BUSY),
            is_excl: layer.kind.common().exclusive as u32,
            excl_collision: lstat_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lstat_pct(LSTAT_EXCL_PREEMPT),
            yielded: lstat_pct(LSTAT_YIELD),
            yield_ignore: lstat(LSTAT_YIELD_IGNORE) as u64,
            migration: lstat_pct(LSTAT_MIGRATION),
            xnuma_migration: lstat_pct(LSTAT_XNUMA_MIGRATION),
            xlayer_wake: lstat_pct(LSTAT_XLAYER_WAKE),
            xlayer_rewake: lstat_pct(LSTAT_XLAYER_REWAKE),
            xllc_migration: lstat_pct(LSTAT_XLLC_MIGRATION),
            xllc_migration_skip: lstat_pct(LSTAT_XLLC_MIGRATION_SKIP),
            llc_drain_try: lstat_pct(LSTAT_LLC_DRAIN_TRY),
            llc_drain: lstat_pct(LSTAT_LLC_DRAIN),
            skip_remote_node: lstat_pct(LSTAT_SKIP_REMOTE_NODE),
            cpus: layer.cpus.as_raw_slice().to_vec(),
            cur_nr_cpus: layer.cpus.weight() as u32,
            min_nr_cpus: nr_cpus_range.0 as u32,
            max_nr_cpus: nr_cpus_range.1 as u32,
            nr_llc_cpus: layer.nr_llc_cpus.iter().map(|&v| v as u32).collect(),
            slice_us: stats.layer_slice_us[lidx],
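            // Per-LLC breakdown: the share of this layer's events handled
            // by each LLC, and the BPF-side latency converted from ns to
            // seconds.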
            llc_fracs: {
                let sid = LLC_LSTAT_CNT;
                let sum = bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| lstats[sid])
                    .sum::<u64>() as f64;
                bstats.llc_lstats[lidx]
                    .iter()
                    .map(|lstats| calc_frac(lstats[sid] as f64, sum))
                    .collect()
            },
            llc_lats: bstats.llc_lstats[lidx]
                .iter()
                .map(|lstats| lstats[LLC_LSTAT_LAT] as f64 / 1_000_000_000.0)
                .collect(),
            membw_pct: membw_frac * 100.0,
            dsq_insert_ewma: stats.layer_dsq_insert_ewma[lidx] * 100.0,
        }
    }

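    /// Writes a multi-line, human-readable summary of this layer's stats,
    /// indented under a `name` column padded to `header_width` characters.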
    pub fn format<W: Write>(&self, w: &mut W, name: &str, header_width: usize) -> Result<()> {
        writeln!(
            w,
            "  {:<width$}: util/open/frac={:6.1}/{}/{:7.1} prot/prot_preempt={}/{} tasks={:6}",
            name,
            self.util,
            fmt_pct(self.util_open_frac),
            self.util_frac,
            fmt_pct(self.util_protected_frac),
            fmt_pct(self.util_protected_preempt_frac),
            self.tasks,
            width = header_width,
        )?;

        writeln!(
            w,
            "  {:<width$}  tot={:7} local_sel/enq={}/{} enq_dsq={} wake/exp/reenq={}/{}/{} dsq_ewma={}",
            "",
            self.total,
            fmt_pct(self.sel_local),
            fmt_pct(self.enq_local),
            fmt_pct(self.enq_dsq),
            fmt_pct(self.enq_wakeup),
            fmt_pct(self.enq_expire),
            fmt_pct(self.enq_reenq),
            fmt_pct(self.dsq_insert_ewma),
            width = header_width,
        )?;

        writeln!(
            w,
            "  {:<width$}  keep/max/busy={}/{}/{} yield/ign={}/{}",
            "",
            fmt_pct(self.keep),
            fmt_pct(self.keep_fail_max_exec),
            fmt_pct(self.keep_fail_busy),
            fmt_pct(self.yielded),
            fmt_num(self.yield_ignore),
            width = header_width,
        )?;

        writeln!(
            w,
            "  {:<width$}  open_idle={} mig={} xnuma_mig={} xllc_mig/skip={}/{} affn_viol={}",
            "",
            fmt_pct(self.open_idle),
            fmt_pct(self.migration),
            fmt_pct(self.xnuma_migration),
            fmt_pct(self.xllc_migration),
            fmt_pct(self.xllc_migration_skip),
            fmt_pct(self.affn_viol),
            width = header_width,
        )?;

        writeln!(
            w,
            "  {:<width$}  preempt/first/xllc/xnuma/idle/fail={}/{}/{}/{}/{}/{}",
            "",
            fmt_pct(self.preempt),
            fmt_pct(self.preempt_first),
            fmt_pct(self.preempt_xllc),
            fmt_pct(self.preempt_xnuma),
            fmt_pct(self.preempt_idle),
            fmt_pct(self.preempt_fail),
            width = header_width,
        )?;

        writeln!(
            w,
            "  {:<width$}  xlayer_wake/re={}/{} llc_drain/try={}/{} skip_rnode={}",
            "",
            fmt_pct(self.xlayer_wake),
            fmt_pct(self.xlayer_rewake),
            fmt_pct(self.llc_drain),
            fmt_pct(self.llc_drain_try),
            fmt_pct(self.skip_remote_node),
            width = header_width,
        )?;

        writeln!(
            w,
            "  {:<width$}  slice={}ms min_exec={}/{:7.2}ms",
            "",
            self.slice_us as f64 / 1000.0,
            fmt_pct(self.min_exec),
            self.min_exec_us as f64 / 1000.0,
            width = header_width
        )?;

        let cpumask = Cpumask::from_vec(self.cpus.clone());

        writeln!(
            w,
            "  {:<width$}  cpus={:3} [{:3},{:3}] {}",
            "",
            self.cur_nr_cpus,
            self.min_nr_cpus,
            self.max_nr_cpus,
            &cpumask,
            width = header_width
        )?;

        write!(
            w,
            "  {:<width$}  [LLC] nr_cpus: sched% lat_ms",
            "",
            width = header_width
        )?;

        for (i, (&frac, &lat)) in self.llc_fracs.iter().zip(self.llc_lats.iter()).enumerate() {
            if (i % 4) == 0 {
                writeln!(w)?;
                write!(w, "  {:<width$}  [{:03}]", "", i, width = header_width)?;
            } else {
                write!(w, " |")?;
            }
            write!(
                w,
                " {:2}:{}%{:7.2}",
                self.nr_llc_cpus[i],
                fmt_pct(frac),
                lat * 1_000.0
            )?;
        }
        writeln!(w)?;

        if self.is_excl != 0 {
            writeln!(
                w,
                "  {:<width$}  excl_coll={} excl_preempt={}",
                "",
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
                width = header_width,
            )?;
        } else if self.excl_collision != 0.0 || self.excl_preempt != 0.0 {
            warn!(
                "{}: exclusive is off but excl_coll={} excl_preempt={}",
                name,
                fmt_pct(self.excl_collision),
                fmt_pct(self.excl_preempt),
            );
        }

        Ok(())
    }
}

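/// System-wide statistics, served as the top-level ("top") stats struct;
/// per-layer stats are nested under `layers`.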
#[stat_doc]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)]
#[stat(top)]
pub struct SysStats {
    #[stat(desc = "timestamp", _om_skip)]
    pub at: f64,
    #[stat(desc = "# of NUMA nodes")]
    pub nr_nodes: usize,
    #[stat(desc = "# sched events during the period")]
    pub total: u64,
    #[stat(desc = "% dispatched directly into an idle CPU from select_cpu")]
    pub local_sel: f64,
    #[stat(desc = "% dispatched directly into an idle CPU from enqueue")]
    pub local_enq: f64,
    #[stat(desc = "% open layer tasks scheduled into allocated but idle CPUs")]
    pub open_idle: f64,
    #[stat(desc = "% violated config due to CPU affinity")]
    pub affn_viol: f64,
    #[stat(desc = "% sent to hi fallback DSQs")]
    pub hi_fb: f64,
    #[stat(desc = "% sent to lo fallback DSQs")]
    pub lo_fb: f64,
    #[stat(desc = "% of times an excl task skipped a CPU as the sibling was also excl")]
    pub excl_collision: f64,
    #[stat(desc = "% of times a sibling CPU was preempted for an excl task")]
    pub excl_preempt: f64,
    #[stat(desc = "ratio of sched events where a CPU skipped dispatching due to an excl task on the sibling")]
    pub excl_idle: f64,
    #[stat(
        desc = "ratio of sched events where an idle sibling CPU was woken up after an excl task finished"
    )]
    pub excl_wakeup: f64,
    #[stat(desc = "CPU time this binary consumed during the period")]
    pub proc_ms: u64,
    #[stat(desc = "CPU busy % (100% means all CPUs)")]
    pub busy: f64,
    #[stat(desc = "CPU util % (100% means one CPU)")]
    pub util: f64,
    #[stat(desc = "CPU util % used by hi fallback DSQs")]
    pub hi_fb_util: f64,
    #[stat(desc = "CPU util % used by lo fallback DSQs")]
    pub lo_fb_util: f64,
    #[stat(desc = "Number of tasks dispatched via antistall")]
    pub antistall: u64,
    #[stat(desc = "Number of times preemptions of non-scx tasks were avoided")]
    pub skip_preempt: u64,
    #[stat(desc = "Number of times vtime was out of range and fixed up")]
    pub fixup_vtime: u64,
    #[stat(desc = "Number of times cpuc->preempting_task didn't end up running on the CPU")]
    pub preempting_mismatch: u64,
    #[stat(desc = "fallback CPU")]
    pub fallback_cpu: u32,
    #[stat(desc = "fallback CPU util %")]
    pub fallback_cpu_util: f64,
    #[stat(desc = "per-layer statistics")]
    pub layers: BTreeMap<String, LayerStats>,
    #[stat(desc = "Number of gpu tasks affinitized since scheduler start")]
    pub gpu_tasks_affinitized: u64,
    #[stat(desc = "Time (in ms) of last affinitization run")]
    pub gpu_task_affinitization_ms: u64,
    #[stat(desc = "System CPU utilization EWMA (10s window)")]
    pub system_cpu_util_ewma: f64,
}

impl SysStats {
    pub fn new(stats: &Stats, bstats: &BpfStats, fallback_cpu: usize) -> Result<Self> {
        let lsum = |idx| stats.bpf_stats.lstats_sums[idx];
        let total = lsum(LSTAT_SEL_LOCAL)
            + lsum(LSTAT_ENQ_LOCAL)
            + lsum(LSTAT_ENQ_WAKEUP)
            + lsum(LSTAT_ENQ_EXPIRE)
            + lsum(LSTAT_ENQ_REENQ)
            + lsum(LSTAT_KEEP);
        let lsum_pct = |idx| {
            if total != 0 {
                lsum(idx) as f64 / total as f64 * 100.0
            } else {
                0.0
            }
        };

        let elapsed_ns = stats.elapsed.as_nanos();

        Ok(Self {
            at: SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs_f64(),
            nr_nodes: stats.nr_nodes,
            total,
            local_sel: lsum_pct(LSTAT_SEL_LOCAL),
            local_enq: lsum_pct(LSTAT_ENQ_LOCAL),
            open_idle: lsum_pct(LSTAT_OPEN_IDLE),
            affn_viol: lsum_pct(LSTAT_AFFN_VIOL),
            hi_fb: calc_frac(
                stats.bpf_stats.gstats[GSTAT_HI_FB_EVENTS] as f64,
                total as f64,
            ),
            lo_fb: calc_frac(
                stats.bpf_stats.gstats[GSTAT_LO_FB_EVENTS] as f64,
                total as f64,
            ),
            excl_collision: lsum_pct(LSTAT_EXCL_COLLISION),
            excl_preempt: lsum_pct(LSTAT_EXCL_PREEMPT),
            // Guard against division by zero when no sched events were
            // recorded during the period.
            excl_idle: if total != 0 {
                bstats.gstats[GSTAT_EXCL_IDLE] as f64 / total as f64
            } else {
                0.0
            },
            excl_wakeup: if total != 0 {
                bstats.gstats[GSTAT_EXCL_WAKEUP] as f64 / total as f64
            } else {
                0.0
            },
            proc_ms: stats.processing_dur.as_millis() as u64,
            busy: stats.cpu_busy * 100.0,
            util: stats.total_util * 100.0,
            hi_fb_util: stats.bpf_stats.gstats[GSTAT_HI_FB_USAGE] as f64 / elapsed_ns as f64
                * 100.0,
            lo_fb_util: stats.bpf_stats.gstats[GSTAT_LO_FB_USAGE] as f64 / elapsed_ns as f64
                * 100.0,
            antistall: stats.bpf_stats.gstats[GSTAT_ANTISTALL],
            skip_preempt: stats.bpf_stats.gstats[GSTAT_SKIP_PREEMPT],
            fixup_vtime: stats.bpf_stats.gstats[GSTAT_FIXUP_VTIME],
            preempting_mismatch: stats.bpf_stats.gstats[GSTAT_PREEMPTING_MISMATCH],
            fallback_cpu: fallback_cpu as u32,
            fallback_cpu_util: stats.bpf_stats.gstats[GSTAT_FB_CPU_USAGE] as f64
                / elapsed_ns as f64
                * 100.0,
            layers: BTreeMap::new(),
            gpu_tasks_affinitized: stats.gpu_tasks_affinitized,
            gpu_task_affinitization_ms: stats.gpu_task_affinitization_ms,
            system_cpu_util_ewma: stats.system_cpu_util_ewma * 100.0,
        })
    }

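    /// Writes the system-wide summary lines; per-layer detail is handled
    /// by `format_all`.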
    pub fn format<W: Write>(&self, w: &mut W) -> Result<()> {
        writeln!(
            w,
            "tot={:7} local_sel/enq={}/{} open_idle={} affn_viol={} hi/lo={}/{}",
            self.total,
            fmt_pct(self.local_sel),
            fmt_pct(self.local_enq),
            fmt_pct(self.open_idle),
            fmt_pct(self.affn_viol),
            fmt_pct(self.hi_fb),
            fmt_pct(self.lo_fb),
        )?;

        writeln!(
            w,
            "busy={:5.1} util/hi/lo={:7.1}/{}/{} fallback_cpu/util={:3}/{:4.1} proc={}ms sys_util_ewma={:5.1}",
            self.busy,
            self.util,
            fmt_pct(self.hi_fb_util),
            fmt_pct(self.lo_fb_util),
            self.fallback_cpu,
            self.fallback_cpu_util,
            self.proc_ms,
            self.system_cpu_util_ewma,
        )?;

        writeln!(
            w,
            "excl_coll={:.2} excl_preempt={:.2} excl_idle={:.2} excl_wakeup={:.2}",
            self.excl_collision, self.excl_preempt, self.excl_idle, self.excl_wakeup
        )?;

        writeln!(
            w,
            "skip_preempt={} antistall={} fixup_vtime={} preempting_mismatch={}",
            self.skip_preempt, self.antistall, self.fixup_vtime, self.preempting_mismatch
        )?;

        writeln!(
            w,
            "gpu_tasks_affinitized={} gpu_task_affinitization_time={}",
            self.gpu_tasks_affinitized, self.gpu_task_affinitization_ms
        )?;

        Ok(())
    }

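    /// Writes the system-wide summary followed by each layer's stats,
    /// ordered by layer index.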
    pub fn format_all<W: Write>(&self, w: &mut W) -> Result<()> {
        self.format(w)?;

        let header_width = self
            .layers
            .keys()
            .map(|name| name.len())
            .max()
            .unwrap_or(0)
            .max(4);

        let mut idx_to_name: Vec<(usize, &String)> =
            self.layers.iter().map(|(k, v)| (v.index, k)).collect();

        idx_to_name.sort();

        for (_idx, name) in &idx_to_name {
            self.layers[*name].format(w, name, header_width)?;
        }

        Ok(())
    }
}

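/// Requests sent from stats-server reader threads to the scheduler. Each
/// reader identifies itself by `ThreadId` so the scheduler can track
/// per-reader state across `Refresh` calls.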
#[derive(Debug)]
pub enum StatsReq {
    Hello(ThreadId),
    Refresh(ThreadId, Stats),
    Bye(ThreadId),
}

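/// Responses from the scheduler to the corresponding `StatsReq`.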
#[derive(Debug)]
pub enum StatsRes {
    Hello(Stats),
    Refreshed((Stats, SysStats)),
    Bye,
}

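/// Builds the scx_stats server data for the "top" stats family.
///
/// Each connection follows a Hello/Refresh.../Bye exchange with the
/// scheduler: `Hello` hands the reader an initial `Stats` snapshot, each
/// `Refresh` trades the previous snapshot for an updated one plus the
/// `SysStats` that gets serialized to the client, and `Bye` ends the
/// session.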
pub fn server_data() -> StatsServerData<StatsReq, StatsRes> {
    let open: Box<dyn StatsOpener<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        let tid = current().id();
        req_ch.send(StatsReq::Hello(tid))?;
        let mut stats = Some(match res_ch.recv()? {
            StatsRes::Hello(v) => v,
            res => bail!("invalid response to Hello: {:?}", res),
        });

        let read: Box<dyn StatsReader<StatsReq, StatsRes>> =
            Box::new(move |_args, (req_ch, res_ch)| {
                req_ch.send(StatsReq::Refresh(tid, stats.take().unwrap()))?;
                let (new_stats, sys_stats) = match res_ch.recv()? {
                    StatsRes::Refreshed(v) => v,
                    res => bail!("invalid response to Refresh: {:?}", res),
                };
                stats = Some(new_stats);
                sys_stats.to_json()
            });

        Ok(read)
    });

    let close: Box<dyn StatsCloser<StatsReq, StatsRes>> = Box::new(move |(req_ch, res_ch)| {
        req_ch.send(StatsReq::Bye(current().id())).unwrap();
        match res_ch.recv().unwrap() {
            StatsRes::Bye => {}
            res => panic!("invalid response to Bye: {:?}", res),
        }
    });

    StatsServerData::new()
        .add_meta(LayerStats::meta())
        .add_meta(SysStats::meta())
        .add_ops(
            "top",
            StatsOps {
                open,
                close: Some(close),
            },
        )
}

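/// Connects to the stats server and prints a formatted `SysStats`
/// snapshot every `intv` until `shutdown` is set.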
pub fn monitor(intv: Duration, shutdown: Arc<AtomicBool>) -> Result<()> {
    scx_utils::monitor_stats::<SysStats>(
        &[],
        intv,
        || shutdown.load(Ordering::Relaxed),
        |sst| {
            let dt = DateTime::<Local>::from(UNIX_EPOCH + Duration::from_secs_f64(sst.at));
            println!("###### {} ######", dt.to_rfc2822());
            sst.format_all(&mut std::io::stdout())
        },
    )
}