scx_utils/
topology.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5
6//! # SCX Topology
7//!
8//! A crate that allows schedulers to inspect and model the host's topology, in
9//! service of creating scheduling domains.
10//!
11//! A Topology is comprised of one or more Node objects, which themselves are
12//! comprised hierarchically of LLC -> Core -> Cpu objects respectively:
13//!```rust,ignore
14//!                                   Topology
15//!                                       |
16//! o--------------------------------o   ...   o----------------o---------------o
17//! |         Node                   |         |         Node                   |
18//! | ID      0                      |         | ID      1                      |
19//! | LLCs    <id, Llc>              |         | LLCs    <id, Llc>              |
20//! | Span    0x00000fffff00000fffff |         | Span    0xfffff00000fffff00000 |
21//! o--------------------------------o         o--------------------------------o
22//!                 \
23//!                  --------------------
24//!                                      \
25//! o--------------------------------o   ...   o--------------------------------o
26//! |             Llc                |         |             Llc                |
27//! | ID     0                       |         | ID     1                       |
28//! | Cores  <id, Core>              |         | Cores  <id, Core>              |
29//! | Span   0x00000ffc0000000ffc00  |         | Span   0x00000003ff00000003ff  |
30//! o--------------------------------o         o----------------o---------------o
31//!                                                             /
32//!                                        ---------------------
33//!                                       /
34//! o--------------------------------o   ...   o--------------------------------o
35//! |              Core              |         |              Core              |
36//! | ID     0                       |         | ID     9                       |
37//! | Cpus   <id, Cpu>               |         | Cpus   <id, Cpu>               |
38//! | Span   0x00000000010000000001  |         | Span   0x00000002000000000200  |
39//! o--------------------------------o         o----------------o---------------o
40//!                                                             /
41//!                                        ---------------------
42//!                                       /
43//! o--------------------------------o   ...   o---------------------------------o
44//! |              Cpu               |         |               Cpu               |
45//! | ID       9                     |         | ID       49                     |
46//! | online   1                     |         | online   1                      |
47//! | min_freq 400000                |         | min_freq 400000                 |
//! | max_freq 5881000               |         | max_freq 5881000                 |
49//! o--------------------------------o         o---------------------------------o
50//!```
51//! Every object contains a Cpumask that spans all CPUs in that point in the
52//! topological hierarchy.
53//!
54//! Creating Topology
55//! -----------------
56//!
57//! Topology objects are created using the static new function:
58//!
59//!```  
60//!     use scx_utils::Topology;
61//!     let top = Topology::new().unwrap();
62//!```
63//!
64//! Querying Topology
65//! -----------------
66//!
67//! With a created Topology, you can query the topological hierarchy using the
68//! set of accessor functions defined below. All objects in the topological
69//! hierarchy are entirely read-only. If the host topology were to change (due
70//! to e.g. hotplug), a new Topology object should be created.
71
72use crate::compat::ROOT_PREFIX;
73use crate::cpumask::read_cpulist;
74use crate::misc::find_best_split_size;
75use crate::misc::read_file_byte;
76use crate::misc::read_file_usize_vec;
77use crate::misc::read_from_file;
78use crate::Cpumask;
79use anyhow::bail;
80use anyhow::Result;
81use glob::glob;
82use log::warn;
83use sscanf::sscanf;
84use std::cmp::min;
85use std::collections::BTreeMap;
86use std::path::Path;
87use std::sync::Arc;
88
89#[cfg(feature = "gpu-topology")]
90use crate::gpu::{create_gpus, Gpu, GpuIndex};
91
lazy_static::lazy_static! {
    /// The maximum possible number of CPU IDs in the system, i.e. the
    /// highest CPU ID plus one. Note that this is different than the
    /// number of possible CPUs on the system (though very seldom is).
    /// This number may differ from the number of possible CPUs on the
    /// system when e.g. there are fully disabled CPUs in the middle of
    /// the range of possible CPUs (i.e. CPUs that may not be onlined).
    pub static ref NR_CPU_IDS: usize = read_cpu_ids().unwrap().last().unwrap() + 1;

    /// The number of possible CPUs that may be active on the system. Note
    /// that this value is separate from the number of possible _CPU IDs_ in
    /// the system, as there may be gaps in what CPUs are allowed to be
    /// onlined. For example, some BIOS implementations may report spans of
    /// disabled CPUs that may not be onlined, whose IDs are lower than the
    /// IDs of other CPUs that may be onlined.
    pub static ref NR_CPUS_POSSIBLE: usize = libbpf_rs::num_possible_cpus().unwrap();

    /// The range to search for when finding the number of physical cores
    /// assigned to a partition to split a large number of cores that share
    /// an LLC domain. The suggested split for the cores isn't a function of
    /// the underlying hardware's capability, but rather some sane number
    /// to help determine the number of CPUs that share the same DSQ.
    pub static ref NR_PARTITION_MIN_CORES: usize = 2;
    pub static ref NR_PARTITION_MAX_CORES: usize = 8;
}
117
/// Relative performance class of a physical core.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum CoreType {
    /// High-performance core. `turbo` is set when this core's raw capacity
    /// equals the system maximum while the average capacity is below the
    /// maximum (see create_insert_cpu), i.e. only a subset of cores can
    /// reach the top capacity.
    Big { turbo: bool },
    /// Lower-capacity core on heterogeneous (big.LITTLE) systems.
    Little,
}
123
/// A single logical CPU (hardware thread), populated from
/// /sys/devices/system/cpu/cpuN.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct Cpu {
    /// Kernel CPU ID (the N in cpuN).
    pub id: usize,
    /// cpufreq scaling_min_freq; 0 if CONFIG_CPU_FREQ is not enabled.
    pub min_freq: usize,
    /// cpufreq scaling_max_freq; 0 if CONFIG_CPU_FREQ is not enabled.
    pub max_freq: usize,
    /// Base operational frequency. Only available on Intel Turbo Boost
    /// CPUs. If not available, this will simply return maximum frequency.
    pub base_freq: usize,
    /// The best-effort guessing of cpu_capacity scaled to 1024.
    pub cpu_capacity: usize,
    /// Number of logical CPUs sharing this CPU's physical core.
    /// Initialized in Topology::instantiate().
    pub smt_level: usize,
    /// CPU idle resume latency
    pub pm_qos_resume_latency_us: usize,
    /// cpufreq cpuinfo_transition_latency; 0 if unavailable.
    pub trans_lat_ns: usize,
    /// Synthesized L2 cache ID (usize::MAX if undetectable).
    pub l2_id: usize,
    /// Synthesized L3 cache ID (usize::MAX if undetectable).
    pub l3_id: usize,
    /// Per-CPU cache size of all levels.
    pub cache_size: usize,
    /// Big/little classification of the owning core.
    pub core_type: CoreType,

    /// Ancestor IDs.
    pub core_id: usize,
    pub llc_id: usize,
    pub node_id: usize,
    pub package_id: usize,
    // Signed because sysfs topology/cluster_id presumably reports -1 when
    // clusters are unsupported — TODO confirm against kernel behavior.
    pub cluster_id: isize,
}
151
/// A physical core holding one or more logical CPUs (SMT siblings).
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Core {
    /// Monotonically increasing unique id
    pub id: usize,
    /// The sysfs value of core_id
    pub kernel_id: usize,
    // Signed: sysfs cluster_id presumably reports -1 when clusters are
    // unsupported — TODO confirm.
    pub cluster_id: isize,
    /// Logical CPUs in this core, keyed by CPU ID.
    pub cpus: BTreeMap<usize, Arc<Cpu>>,
    /// Cpumask of all CPUs in this core.
    pub span: Cpumask,
    /// Big/little classification shared by all CPUs of this core.
    pub core_type: CoreType,

    /// Ancestor IDs.
    pub llc_id: usize,
    pub node_id: usize,
}
168
/// A last-level-cache domain grouping the cores that share it.
#[derive(Debug, Clone)]
pub struct Llc {
    /// Monotonically increasing unique id
    pub id: usize,
    /// The kernel id of the llc
    pub kernel_id: usize,
    /// Cores in this LLC, keyed by core ID.
    pub cores: BTreeMap<usize, Arc<Core>>,
    /// Cpumask of all CPUs in this llc.
    pub span: Cpumask,

    /// Ancestor IDs.
    pub node_id: usize,

    /// Skip indices to access lower level members easily.
    pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}
185
/// A NUMA node containing one or more LLC domains.
#[derive(Debug, Clone)]
pub struct Node {
    /// NUMA node ID.
    pub id: usize,
    // Presumably per-node NUMA distances indexed by node ID; left empty by
    // create_default_node, and the NUMA population path is not visible in
    // this chunk — TODO confirm against create_numa_nodes.
    pub distance: Vec<usize>,
    /// LLC domains in this node, keyed by LLC ID.
    pub llcs: BTreeMap<usize, Arc<Llc>>,
    /// Cpumask of all CPUs in this node.
    pub span: Cpumask,

    /// Skip indices to access lower level members easily.
    pub all_cores: BTreeMap<usize, Arc<Core>>,
    pub all_cpus: BTreeMap<usize, Arc<Cpu>>,

    #[cfg(feature = "gpu-topology")]
    pub gpus: BTreeMap<GpuIndex, Gpu>,
}
201
/// Root of the host topology model: NUMA nodes -> LLCs -> cores -> CPUs.
/// Read-only after construction; rebuild on topology changes (hotplug).
#[derive(Debug)]
pub struct Topology {
    /// NUMA nodes keyed by node ID.
    pub nodes: BTreeMap<usize, Node>,
    /// Cpumask all CPUs in the system.
    pub span: Cpumask,
    /// True if SMT is enabled in the system, false otherwise.
    pub smt_enabled: bool,

    /// Skip indices to access lower level members easily.
    pub all_llcs: BTreeMap<usize, Arc<Llc>>,
    pub all_cores: BTreeMap<usize, Arc<Core>>,
    pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}
215
216impl Topology {
217    fn instantiate(span: Cpumask, mut nodes: BTreeMap<usize, Node>) -> Result<Self> {
218        // Build skip indices prefixed with all_ for easy lookups. As Arc
219        // objects can only be modified while there's only one reference,
220        // skip indices must be built from bottom to top.
221        let mut topo_llcs = BTreeMap::new();
222        let mut topo_cores = BTreeMap::new();
223        let mut topo_cpus = BTreeMap::new();
224
225        for (_node_id, node) in nodes.iter_mut() {
226            let mut node_cores = BTreeMap::new();
227            let mut node_cpus = BTreeMap::new();
228
229            for (&llc_id, llc) in node.llcs.iter_mut() {
230                let llc_mut = Arc::get_mut(llc).unwrap();
231                let mut llc_cpus = BTreeMap::new();
232
233                for (&core_id, core) in llc_mut.cores.iter_mut() {
234                    let core_mut = Arc::get_mut(core).unwrap();
235                    let smt_level = core_mut.cpus.len();
236
237                    for (&cpu_id, cpu) in core_mut.cpus.iter_mut() {
238                        let cpu_mut = Arc::get_mut(cpu).unwrap();
239                        cpu_mut.smt_level = smt_level;
240
241                        if topo_cpus
242                            .insert(cpu_id, cpu.clone())
243                            .or(node_cpus.insert(cpu_id, cpu.clone()))
244                            .or(llc_cpus.insert(cpu_id, cpu.clone()))
245                            .is_some()
246                        {
247                            bail!("Duplicate CPU ID {}", cpu_id);
248                        }
249                    }
250
251                    // Note that in some weird architectures, core ids can be
252                    // duplicated in different LLC domains.
253                    topo_cores
254                        .insert(core_id, core.clone())
255                        .or(node_cores.insert(core_id, core.clone()));
256                }
257
258                llc_mut.all_cpus = llc_cpus;
259
260                if topo_llcs.insert(llc_id, llc.clone()).is_some() {
261                    bail!("Duplicate LLC ID {}", llc_id);
262                }
263            }
264
265            node.all_cores = node_cores;
266            node.all_cpus = node_cpus;
267        }
268
269        Ok(Topology {
270            nodes,
271            span,
272            smt_enabled: is_smt_active().unwrap_or(false),
273            all_llcs: topo_llcs,
274            all_cores: topo_cores,
275            all_cpus: topo_cpus,
276        })
277    }
278
279    /// Build a complete host Topology
280    pub fn new() -> Result<Topology> {
281        Self::with_virt_llcs(None)
282    }
283
284    pub fn with_virt_llcs(nr_cores_per_vllc: Option<(usize, usize)>) -> Result<Topology> {
285        let span = cpus_online()?;
286        let mut topo_ctx = TopoCtx::new();
287
288        // If the kernel is compiled with CONFIG_NUMA, then build a topology
289        // from the NUMA hierarchy in sysfs. Otherwise, just make a single
290        // default node of ID 0 which contains all cores.
291        let path = format!("{}/sys/devices/system/node", *ROOT_PREFIX);
292        let nodes = if Path::new(&path).exists() {
293            create_numa_nodes(&span, &mut topo_ctx, nr_cores_per_vllc)?
294        } else {
295            create_default_node(&span, &mut topo_ctx, false, nr_cores_per_vllc)?
296        };
297
298        Self::instantiate(span, nodes)
299    }
300
301    pub fn with_flattened_llc_node() -> Result<Topology> {
302        let span = cpus_online()?;
303        let mut topo_ctx = TopoCtx::new();
304        let nodes = create_default_node(&span, &mut topo_ctx, true, None)?;
305        Self::instantiate(span, nodes)
306    }
307
308    /// Build a topology with configuration from CLI arguments.
309    /// This method integrates with the TopologyArgs from the cli module to
310    /// create a topology based on command line parameters.
311    pub fn with_args(topology_args: &crate::cli::TopologyArgs) -> Result<Topology> {
312        // Validate the CLI arguments first
313        topology_args.validate()?;
314
315        // Get the virtual LLC configuration
316        let nr_cores_per_vllc = topology_args.get_nr_cores_per_vllc();
317
318        // Build topology with the specified configuration
319        Self::with_virt_llcs(nr_cores_per_vllc)
320    }
321
322    /// Get a vec of all GPUs on the hosts.
323    #[cfg(feature = "gpu-topology")]
324    pub fn gpus(&self) -> BTreeMap<GpuIndex, &Gpu> {
325        let mut gpus = BTreeMap::new();
326        for node in self.nodes.values() {
327            for (idx, gpu) in &node.gpus {
328                gpus.insert(*idx, gpu);
329            }
330        }
331        gpus
332    }
333
334    /// Returns whether the Topology has a hybrid architecture of big and little cores.
335    pub fn has_little_cores(&self) -> bool {
336        self.all_cores
337            .values()
338            .any(|c| c.core_type == CoreType::Little)
339    }
340
341    /// Returns a vector that maps the index of each logical CPU to the
342    /// sibling CPU. This represents the "next sibling" CPU within a package
343    /// in systems that support SMT. The sibling CPU is the other logical
344    /// CPU that shares the physical resources of the same physical core.
345    ///
346    /// Assuming each core holds exactly at most two cpus.
347    pub fn sibling_cpus(&self) -> Vec<i32> {
348        let mut sibling_cpu = vec![-1i32; *NR_CPUS_POSSIBLE];
349        for core in self.all_cores.values() {
350            let mut first = -1i32;
351            for &cpu in core.cpus.keys() {
352                if first < 0 {
353                    first = cpu as i32;
354                } else {
355                    sibling_cpu[first as usize] = cpu as i32;
356                    sibling_cpu[cpu] = first;
357                    break;
358                }
359            }
360        }
361        sibling_cpu
362    }
363}
364
365/******************************************************
366 * Helper structs/functions for creating the Topology *
367 ******************************************************/
368/// TopoCtx is a helper struct used to build a topology.
369struct TopoCtx {
370    /// Mapping of NUMA node core ids
371    node_core_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
372    /// Mapping of NUMA node LLC ids
373    node_llc_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
374    /// Mapping of L2 ids
375    l2_ids: BTreeMap<String, usize>,
376    /// Mapping of L3 ids
377    l3_ids: BTreeMap<String, usize>,
378}
379
380impl TopoCtx {
381    fn new() -> TopoCtx {
382        let core_kernel_ids = BTreeMap::new();
383        let llc_kernel_ids = BTreeMap::new();
384        let l2_ids = BTreeMap::new();
385        let l3_ids = BTreeMap::new();
386        TopoCtx {
387            node_core_kernel_ids: core_kernel_ids,
388            node_llc_kernel_ids: llc_kernel_ids,
389            l2_ids,
390            l3_ids,
391        }
392    }
393}
394
395fn cpus_online() -> Result<Cpumask> {
396    let path = format!("{}/sys/devices/system/cpu/online", *ROOT_PREFIX);
397    let online = std::fs::read_to_string(path)?;
398    Cpumask::from_cpulist(&online)
399}
400
401fn get_cache_id(topo_ctx: &mut TopoCtx, cache_level_path: &Path, cache_level: usize) -> usize {
402    // Check if the cache id is already cached
403    let id_map = match cache_level {
404        2 => &mut topo_ctx.l2_ids,
405        3 => &mut topo_ctx.l3_ids,
406        _ => return usize::MAX,
407    };
408
409    let path = &cache_level_path.join("shared_cpu_list");
410    let key = match std::fs::read_to_string(path) {
411        Ok(key) => key,
412        Err(_) => return usize::MAX,
413    };
414
415    let id = *id_map.get(&key).unwrap_or(&usize::MAX);
416    if id != usize::MAX {
417        return id;
418    }
419
420    // In case of a cache miss, try to get the id from the sysfs first.
421    let id = read_from_file(&cache_level_path.join("id")).unwrap_or(usize::MAX);
422    if id != usize::MAX {
423        // Keep the id in the map
424        id_map.insert(key, id);
425        return id;
426    }
427
428    // If the id file does not exist, assign an id and keep it in the map.
429    let id = id_map.len();
430    id_map.insert(key, id);
431
432    id
433}
434
435fn get_per_cpu_cache_size(cache_path: &Path) -> Result<usize> {
436    let path_str = cache_path.to_str().unwrap();
437    let paths = glob(&(path_str.to_owned() + "/index[0-9]*"))?;
438    let mut tot_size = 0;
439
440    for index in paths.filter_map(Result::ok) {
441        // If there is no size information under sysfs (e.g., many ARM SoCs),
442        // give 1024 as a default value. 1024 is small enough compared to the
443        // real cache size of the CPU, but it is large enough to give a penalty
444        // when multiple CPUs share the cache.
445        let size = read_file_byte(&index.join("size")).unwrap_or(1024_usize);
446        let cpulist: String = read_from_file(&index.join("shared_cpu_list"))?;
447        let num_cpus = read_cpulist(&cpulist)?.len();
448        tot_size += size / num_cpus;
449    }
450
451    Ok(tot_size)
452}
453
/// Read CPU `id`'s attributes from sysfs and insert the resulting Cpu into
/// `node`, creating its enclosing Llc and Core entries on first use.
/// Offline CPUs are skipped silently. `cs` provides the capacity source
/// used to estimate cpu_capacity; `flatten_llc` forces every CPU into a
/// single LLC with kernel id 0.
#[allow(clippy::too_many_arguments)]
fn create_insert_cpu(
    id: usize,
    node: &mut Node,
    online_mask: &Cpumask,
    topo_ctx: &mut TopoCtx,
    cs: &CapacitySource,
    flatten_llc: bool,
) -> Result<()> {
    // CPU is offline. The Topology hierarchy is read-only, and assumes
    // that hotplug will cause the scheduler to restart. Thus, we can
    // just skip this CPU altogether.
    if !online_mask.test_cpu(id) {
        return Ok(());
    }

    let cpu_str = format!("{}/sys/devices/system/cpu/cpu{}", *ROOT_PREFIX, id);
    let cpu_path = Path::new(&cpu_str);

    // Physical core ID
    let top_path = cpu_path.join("topology");
    let core_kernel_id = read_from_file(&top_path.join("core_id"))?;
    let package_id = read_from_file(&top_path.join("physical_package_id"))?;
    // NOTE(review): unlike the frequency reads below, a missing cluster_id
    // file propagates an error here — confirm all supported kernels expose it.
    let cluster_id = read_from_file(&top_path.join("cluster_id"))?;

    // Evaluate L2, L3 and LLC cache IDs.
    //
    // Use ID 0 if we fail to detect the cache hierarchy. This seems to happen on certain SKUs, so
    // if there's no cache information then we have no option but to assume a single unified cache
    // per node.
    let cache_path = cpu_path.join("cache");
    let l2_id = get_cache_id(topo_ctx, &cache_path.join(format!("index{}", 2)), 2);
    let l3_id = get_cache_id(topo_ctx, &cache_path.join(format!("index{}", 3)), 3);
    // LLC is the L3 when detected, otherwise fall back to the L2.
    let llc_kernel_id = if flatten_llc {
        0
    } else if l3_id == usize::MAX {
        l2_id
    } else {
        l3_id
    };

    // Per-CPU cache size
    let cache_size = get_per_cpu_cache_size(&cache_path).unwrap_or(0_usize);

    // Min and max frequencies. If the kernel is not compiled with
    // CONFIG_CPU_FREQ, just assume 0 for both frequencies.
    let freq_path = cpu_path.join("cpufreq");
    let min_freq = read_from_file(&freq_path.join("scaling_min_freq")).unwrap_or(0_usize);
    let max_freq = read_from_file(&freq_path.join("scaling_max_freq")).unwrap_or(0_usize);
    // base_frequency is Intel-specific; fall back to max_freq elsewhere.
    let base_freq = read_from_file(&freq_path.join("base_frequency")).unwrap_or(max_freq);
    let trans_lat_ns =
        read_from_file(&freq_path.join("cpuinfo_transition_latency")).unwrap_or(0_usize);

    // Cpu capacity: raw value from the chosen source, scaled to 0..=1024.
    let cap_path = cpu_path.join(cs.suffix.clone());
    let rcap = read_from_file(&cap_path).unwrap_or(cs.max_rcap);
    let cpu_capacity = (rcap * 1024) / cs.max_rcap;

    // Power management
    let power_path = cpu_path.join("power");
    let pm_qos_resume_latency_us =
        read_from_file(&power_path.join("pm_qos_resume_latency_us")).unwrap_or(0_usize);

    // Map (node, package, kernel LLC id) to a monotonically assigned LLC id.
    let num_llcs = topo_ctx.node_llc_kernel_ids.len();
    let llc_id = topo_ctx
        .node_llc_kernel_ids
        .entry((node.id, package_id, llc_kernel_id))
        .or_insert(num_llcs);

    let llc = node.llcs.entry(*llc_id).or_insert(Arc::new(Llc {
        id: *llc_id,
        cores: BTreeMap::new(),
        span: Cpumask::new(),
        all_cpus: BTreeMap::new(),

        node_id: node.id,
        kernel_id: llc_kernel_id,
    }));
    // unwrap(): during construction no clones of this Arc exist yet.
    let llc_mut = Arc::get_mut(llc).unwrap();

    // Classify the core: turbo-big if it alone reaches the max capacity,
    // little only on big.LITTLE systems with below-average capacity.
    let core_type = if cs.avg_rcap < cs.max_rcap && rcap == cs.max_rcap {
        CoreType::Big { turbo: true }
    } else if !cs.has_biglittle || rcap >= cs.avg_rcap {
        CoreType::Big { turbo: false }
    } else {
        CoreType::Little
    };

    // Map (node, package, kernel core id) to a monotonically assigned core id.
    let num_cores = topo_ctx.node_core_kernel_ids.len();
    let core_id = topo_ctx
        .node_core_kernel_ids
        .entry((node.id, package_id, core_kernel_id))
        .or_insert(num_cores);

    let core = llc_mut.cores.entry(*core_id).or_insert(Arc::new(Core {
        id: *core_id,
        cpus: BTreeMap::new(),
        span: Cpumask::new(),
        core_type: core_type.clone(),

        llc_id: *llc_id,
        node_id: node.id,
        kernel_id: core_kernel_id,
        cluster_id,
    }));
    // unwrap(): during construction no clones of this Arc exist yet.
    let core_mut = Arc::get_mut(core).unwrap();

    core_mut.cpus.insert(
        id,
        Arc::new(Cpu {
            id,
            min_freq,
            max_freq,
            base_freq,
            cpu_capacity,
            smt_level: 0, // Will be initialized at instantiate().
            pm_qos_resume_latency_us,
            trans_lat_ns,
            l2_id,
            l3_id,
            cache_size,
            core_type: core_type.clone(),

            core_id: *core_id,
            llc_id: *llc_id,
            node_id: node.id,
            package_id,
            cluster_id,
        }),
    );

    if node.span.test_cpu(id) {
        bail!("Node {} already had CPU {}", node.id, id);
    }

    // Update all of the devices' spans to include this CPU.
    core_mut.span.set_cpu(id)?;
    llc_mut.span.set_cpu(id)?;
    node.span.set_cpu(id)?;

    Ok(())
}
596
597fn read_cpu_ids() -> Result<Vec<usize>> {
598    let mut cpu_ids = vec![];
599    let path = format!("{}/sys/devices/system/cpu/cpu[0-9]*", *ROOT_PREFIX);
600    let cpu_paths = glob(&path)?;
601    for cpu_path in cpu_paths.filter_map(Result::ok) {
602        let cpu_str = cpu_path.to_str().unwrap().trim();
603        if ROOT_PREFIX.is_empty() {
604            match sscanf!(cpu_str, "/sys/devices/system/cpu/cpu{usize}") {
605                Ok(val) => cpu_ids.push(val),
606                Err(_) => {
607                    bail!("Failed to parse cpu ID {}", cpu_str);
608                }
609            }
610        } else {
611            match sscanf!(cpu_str, "{str}/sys/devices/system/cpu/cpu{usize}") {
612                Ok((_, val)) => cpu_ids.push(val),
613                Err(_) => {
614                    bail!("Failed to parse cpu ID {}", cpu_str);
615                }
616            }
617        }
618    }
619    cpu_ids.sort();
620    Ok(cpu_ids)
621}
622
/// Describes where and how raw CPU capacity values are read from sysfs,
/// plus summary statistics used to classify cores (see get_capacity_source).
struct CapacitySource {
    /// Path suffix after /sys/devices/system/cpu/cpuX
    suffix: String,
    /// Average raw capacity value across all CPUs
    avg_rcap: usize,
    /// Maximum raw capacity value
    max_rcap: usize,
    /// Does a system have little cores?
    has_biglittle: bool,
}
633
/// Pick the most precise sysfs source for estimating per-CPU capacity and
/// summarize it (average/max raw capacity, big.LITTLE detection).
/// Returns None only when globbing sysfs fails.
fn get_capacity_source() -> Option<CapacitySource> {
    // Sources for guessing cpu_capacity under /sys/devices/system/cpu/cpuX.
    // They should be ordered from the most precise to the least precise.
    let sources = [
        "cpufreq/amd_pstate_prefcore_ranking",
        "cpufreq/amd_pstate_highest_perf",
        "acpi_cppc/highest_perf",
        "cpu_capacity",
        "cpufreq/cpuinfo_max_freq",
    ];

    // Find the most precise source for cpu_capacity estimation.
    let prefix = format!("{}/sys/devices/system/cpu/cpu0", *ROOT_PREFIX);
    let mut raw_capacity;
    // Default to the least precise source if nothing better is found.
    let mut suffix = sources[sources.len() - 1];
    'outer: for src in sources {
        let path_str = [prefix.clone(), src.to_string()].join("/");
        let path = Path::new(&path_str);
        raw_capacity = read_from_file(&path).unwrap_or(0_usize);
        if raw_capacity > 0 {
            // It would be an okay source...
            suffix = src;
            // But double-check if the source has meaningful information.
            let path = format!("{}/sys/devices/system/cpu/cpu[0-9]*", *ROOT_PREFIX);
            let cpu_paths = glob(&path).ok()?;
            for cpu_path in cpu_paths.filter_map(Result::ok) {
                let raw_capacity2 = read_from_file(&cpu_path.join(suffix)).unwrap_or(0_usize);
                if raw_capacity != raw_capacity2 {
                    // Capacities differ across CPUs: this source carries real
                    // information, so accept it and stop searching.
                    break 'outer;
                }
            }
            // The source exists, but it tells that all CPUs have the same
            // capacity. Let's search more if there is any source that can
            // tell the capacity differences among CPUs. This can happen when
            // a buggy driver lies (e.g., "acpi_cppc/highest_perf").
        }
    }

    // Find the max raw_capacity value for scaling to 1024.
    let mut max_rcap = 0;
    let mut min_rcap = usize::MAX;
    let mut avg_rcap = 0;
    let mut nr_cpus = 0;
    let mut has_biglittle = false;
    let path = format!("{}/sys/devices/system/cpu/cpu[0-9]*", *ROOT_PREFIX);
    let cpu_paths = glob(&path).ok()?;
    for cpu_path in cpu_paths.filter_map(Result::ok) {
        let rcap = read_from_file(&cpu_path.join(suffix)).unwrap_or(0_usize);
        if max_rcap < rcap {
            max_rcap = rcap;
        }
        if min_rcap > rcap {
            min_rcap = rcap;
        }
        // avg_rcap accumulates the sum here; it is divided by nr_cpus below.
        avg_rcap += rcap;
        nr_cpus += 1;
    }

    if nr_cpus == 0 || max_rcap == 0 {
        // No capacity information at all; fall back to uniform 1024.
        suffix = "";
        avg_rcap = 1024;
        max_rcap = 1024;
        warn!("CPU capacity information is not available under sysfs.");
    } else {
        avg_rcap /= nr_cpus;
        // We consider a system to have a heterogeneous CPU architecture only
        // when there is a significant capacity gap (e.g., 1.3x). CPU capacities
        // can still vary in a homogeneous architecture—for instance, due to
        // chip binning or when only a subset of CPUs supports turbo boost.
        //
        // Note that we need a more systematic approach to accurately detect
        // big/LITTLE architectures across various SoC designs. The current
        // approach, with a significant capacity difference, is somewhat ad-hoc.
        //
        // NOTE(review): if any CPU reads rcap == 0 here, min_rcap stays 0 and
        // this comparison flags big.LITTLE unconditionally — confirm a zero
        // reading is impossible with the chosen source.
        has_biglittle = max_rcap as f32 >= (1.3 * min_rcap as f32);
    }

    Some(CapacitySource {
        suffix: suffix.to_string(),
        avg_rcap,
        max_rcap,
        has_biglittle,
    })
}
717
718fn is_smt_active() -> Option<bool> {
719    let path = format!("{}/sys/devices/system/cpu/smt/active", *ROOT_PREFIX);
720    let smt_on: u8 = read_from_file(Path::new(&path)).ok()?;
721    Some(smt_on == 1)
722}
723
/// Replace `node`'s physical LLCs with virtual LLCs, partitioning each
/// LLC's cores (grouped by big/little type) into chunks of between
/// `min_cores` and `max_cores` cores so that fewer CPUs share one domain.
/// Virtual LLC ids are assigned sequentially; each keeps the kernel_id of
/// the physical LLC it was carved from.
fn replace_with_virt_llcs(node: &mut Node, min_cores: usize, max_cores: usize) -> Result<()> {
    let mut partition_id = 0;
    let mut core_to_partition: BTreeMap<usize, usize> = BTreeMap::new();
    let mut partition_to_kernel_id: BTreeMap<usize, usize> = BTreeMap::new();
    let mut total_partitions = 0;

    // First pass: determine core to partition mapping, partition to kernel_id mapping, and total partitions needed
    for (_llc_id, llc) in node.llcs.iter() {
        // Group cores by type (big/little) to partition separately
        let mut cores_by_type: BTreeMap<bool, Vec<usize>> = BTreeMap::new();

        for (core_id, core) in llc.cores.iter() {
            // Key is "is little": false buckets big cores, true little ones.
            let core_type = core.core_type == CoreType::Little;
            cores_by_type
                .entry(core_type)
                .or_insert(Vec::new())
                .push(*core_id);
        }

        for (_core_type, core_ids) in cores_by_type.iter() {
            let num_cores_in_bucket = core_ids.len();

            // Find optimal partition size within specified range
            let best_split = find_best_split_size(num_cores_in_bucket, min_cores, max_cores);
            // NOTE(review): assumes find_best_split_size returns a value in
            // [1, num_cores_in_bucket] so that num_partitions >= 1; otherwise
            // the `num_partitions - 1` below would underflow — TODO confirm
            // its contract.
            let num_partitions = num_cores_in_bucket / best_split;

            // Assign cores to partitions within a group type
            for (bucket_idx, &core_id) in core_ids.iter().enumerate() {
                // min() folds any remainder cores into the last partition.
                let partition_idx = min(bucket_idx / best_split, num_partitions - 1);
                let current_partition_id = partition_id + partition_idx;
                core_to_partition.insert(core_id, current_partition_id);
                partition_to_kernel_id.insert(current_partition_id, llc.kernel_id);
            }

            partition_id += num_partitions;
            total_partitions = partition_id;
        }
    }

    // Create new virtual LLC structures based on partitioning found above
    let mut virt_llcs: BTreeMap<usize, Arc<Llc>> = BTreeMap::new();

    for partition_id in 0..total_partitions {
        // unwrap(): every id in 0..total_partitions was inserted above.
        let kernel_id = partition_to_kernel_id.get(&partition_id).copied().unwrap();
        virt_llcs.insert(
            partition_id,
            Arc::new(Llc {
                id: partition_id,
                kernel_id,
                cores: BTreeMap::new(),
                span: Cpumask::new(),
                node_id: node.id,
                all_cpus: BTreeMap::new(),
            }),
        );
    }

    // Second pass: move cores to the appropriate new LLC based on partition
    for (_llc_id, llc) in node.llcs.iter_mut() {
        for (core_id, core) in llc.cores.iter() {
            if let Some(&target_partition_id) = core_to_partition.get(core_id) {
                if let Some(target_llc) = virt_llcs.get_mut(&target_partition_id) {
                    let target_llc_mut = Arc::get_mut(target_llc).unwrap();

                    // Clone core and update its LLC ID to match new partition
                    let mut new_core = (**core).clone();
                    new_core.llc_id = target_partition_id;

                    // Update all CPUs within this core to reference new LLC ID
                    let mut updated_cpus = BTreeMap::new();
                    for (cpu_id, cpu) in new_core.cpus.iter() {
                        let mut new_cpu = (**cpu).clone();
                        new_cpu.llc_id = target_partition_id;

                        // Add CPU to the virtual LLC's span
                        target_llc_mut.span.set_cpu(*cpu_id)?;

                        updated_cpus.insert(*cpu_id, Arc::new(new_cpu));
                    }
                    new_core.cpus = updated_cpus;

                    // Add the updated core to the virtual LLC
                    target_llc_mut.cores.insert(*core_id, Arc::new(new_core));
                }
            }
        }
    }

    // Replace original LLCs with virtual LLCs
    node.llcs = virt_llcs;

    Ok(())
}
817
818fn create_default_node(
819    online_mask: &Cpumask,
820    topo_ctx: &mut TopoCtx,
821    flatten_llc: bool,
822    nr_cores_per_vllc: Option<(usize, usize)>,
823) -> Result<BTreeMap<usize, Node>> {
824    let mut nodes = BTreeMap::<usize, Node>::new();
825
826    let mut node = Node {
827        id: 0,
828        distance: vec![],
829        llcs: BTreeMap::new(),
830        span: Cpumask::new(),
831        #[cfg(feature = "gpu-topology")]
832        gpus: BTreeMap::new(),
833        all_cores: BTreeMap::new(),
834        all_cpus: BTreeMap::new(),
835    };
836
837    #[cfg(feature = "gpu-topology")]
838    {
839        let system_gpus = create_gpus();
840        if let Some(gpus) = system_gpus.get(&0) {
841            for gpu in gpus {
842                node.gpus.insert(gpu.index, gpu.clone());
843            }
844        }
845    }
846
847    let path = format!("{}/sys/devices/system/cpu", *ROOT_PREFIX);
848    if !Path::new(&path).exists() {
849        bail!("/sys/devices/system/cpu sysfs node not found");
850    }
851
852    let cs = get_capacity_source().unwrap();
853    let cpu_ids = read_cpu_ids()?;
854    for cpu_id in cpu_ids.iter() {
855        create_insert_cpu(*cpu_id, &mut node, online_mask, topo_ctx, &cs, flatten_llc)?;
856    }
857
858    if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc {
859        replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val)?;
860    }
861
862    nodes.insert(node.id, node);
863
864    Ok(nodes)
865}
866
867fn create_numa_nodes(
868    online_mask: &Cpumask,
869    topo_ctx: &mut TopoCtx,
870    nr_cores_per_vllc: Option<(usize, usize)>,
871) -> Result<BTreeMap<usize, Node>> {
872    let mut nodes = BTreeMap::<usize, Node>::new();
873
874    #[cfg(feature = "gpu-topology")]
875    let system_gpus = create_gpus();
876
877    let path = format!("{}/sys/devices/system/node/node*", *ROOT_PREFIX);
878    let numa_paths = glob(&path)?;
879    for numa_path in numa_paths.filter_map(Result::ok) {
880        let numa_str = numa_path.to_str().unwrap().trim();
881        let node_id = if ROOT_PREFIX.is_empty() {
882            match sscanf!(numa_str, "/sys/devices/system/node/node{usize}") {
883                Ok(val) => val,
884                Err(_) => {
885                    bail!("Failed to parse NUMA node ID {}", numa_str);
886                }
887            }
888        } else {
889            match sscanf!(numa_str, "{str}/sys/devices/system/node/node{usize}") {
890                Ok((_, val)) => val,
891                Err(_) => {
892                    bail!("Failed to parse NUMA node ID {}", numa_str);
893                }
894            }
895        };
896
897        let distance = read_file_usize_vec(
898            Path::new(&format!(
899                "{}/sys/devices/system/node/node{}/distance",
900                *ROOT_PREFIX, node_id
901            )),
902            ' ',
903        )?;
904        let mut node = Node {
905            id: node_id,
906            distance,
907            llcs: BTreeMap::new(),
908            span: Cpumask::new(),
909
910            all_cores: BTreeMap::new(),
911            all_cpus: BTreeMap::new(),
912
913            #[cfg(feature = "gpu-topology")]
914            gpus: BTreeMap::new(),
915        };
916
917        #[cfg(feature = "gpu-topology")]
918        {
919            if let Some(gpus) = system_gpus.get(&node_id) {
920                for gpu in gpus {
921                    node.gpus.insert(gpu.index, gpu.clone());
922                }
923            }
924        }
925
926        let cpu_pattern = numa_path.join("cpu[0-9]*");
927        let cpu_paths = glob(cpu_pattern.to_string_lossy().as_ref())?;
928        let cs = get_capacity_source().unwrap();
929        let mut cpu_ids = vec![];
930        for cpu_path in cpu_paths.filter_map(Result::ok) {
931            let cpu_str = cpu_path.to_str().unwrap().trim();
932            let cpu_id = if ROOT_PREFIX.is_empty() {
933                match sscanf!(cpu_str, "/sys/devices/system/node/node{usize}/cpu{usize}") {
934                    Ok((_, val)) => val,
935                    Err(_) => {
936                        bail!("Failed to parse cpu ID {}", cpu_str);
937                    }
938                }
939            } else {
940                match sscanf!(
941                    cpu_str,
942                    "{str}/sys/devices/system/node/node{usize}/cpu{usize}"
943                ) {
944                    Ok((_, _, val)) => val,
945                    Err(_) => {
946                        bail!("Failed to parse cpu ID {}", cpu_str);
947                    }
948                }
949            };
950            cpu_ids.push(cpu_id);
951        }
952        cpu_ids.sort();
953
954        for cpu_id in cpu_ids {
955            create_insert_cpu(cpu_id, &mut node, online_mask, topo_ctx, &cs, false)?;
956        }
957
958        if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc {
959            replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val)?;
960        }
961
962        nodes.insert(node.id, node);
963    }
964    Ok(nodes)
965}