scx_utils/
topology.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5
6//! # SCX Topology
7//!
8//! A crate that allows schedulers to inspect and model the host's topology, in
9//! service of creating scheduling domains.
10//!
11//! A Topology is comprised of one or more Node objects, which themselves are
12//! comprised hierarchically of LLC -> Core -> Cpu objects respectively:
13//!```rust,ignore
14//!                                   Topology
15//!                                       |
16//! o--------------------------------o   ...   o----------------o---------------o
17//! |         Node                   |         |         Node                   |
18//! | ID      0                      |         | ID      1                      |
19//! | LLCs    <id, Llc>              |         | LLCs    <id, Llc>              |
20//! | Span    0x00000fffff00000fffff |         | Span    0xfffff00000fffff00000 |
21//! o--------------------------------o         o--------------------------------o
22//!                 \
23//!                  --------------------
24//!                                      \
25//! o--------------------------------o   ...   o--------------------------------o
26//! |             Llc                |         |             Llc                |
27//! | ID     0                       |         | ID     1                       |
28//! | Cores  <id, Core>              |         | Cores  <id, Core>              |
29//! | Span   0x00000ffc0000000ffc00  |         | Span   0x00000003ff00000003ff  |
30//! o--------------------------------o         o----------------o---------------o
31//!                                                             /
32//!                                        ---------------------
33//!                                       /
34//! o--------------------------------o   ...   o--------------------------------o
35//! |              Core              |         |              Core              |
36//! | ID     0                       |         | ID     9                       |
37//! | Cpus   <id, Cpu>               |         | Cpus   <id, Cpu>               |
38//! | Span   0x00000000010000000001  |         | Span   0x00000002000000000200  |
39//! o--------------------------------o         o----------------o---------------o
40//!                                                             /
41//!                                        ---------------------
42//!                                       /
43//! o--------------------------------o   ...   o---------------------------------o
44//! |              Cpu               |         |               Cpu               |
45//! | ID       9                     |         | ID       49                     |
46//! | online   1                     |         | online   1                      |
47//! | min_freq 400000                |         | min_freq 400000                 |
//! | max_freq 5881000               |         | max_freq 5881000                |
49//! o--------------------------------o         o---------------------------------o
50//!```
51//! Every object contains a Cpumask that spans all CPUs in that point in the
52//! topological hierarchy.
53//!
54//! Creating Topology
55//! -----------------
56//!
57//! Topology objects are created using the static new function:
58//!
59//!```  
60//!     use scx_utils::Topology;
61//!     let top = Topology::new().unwrap();
62//!```
63//!
64//! Querying Topology
65//! -----------------
66//!
67//! With a created Topology, you can query the topological hierarchy using the
68//! set of accessor functions defined below. All objects in the topological
69//! hierarchy are entirely read-only. If the host topology were to change (due
70//! to e.g. hotplug), a new Topology object should be created.
71
72use crate::compat::ROOT_PREFIX;
73use crate::cpumask::read_cpulist;
74use crate::misc::find_best_split_size;
75use crate::misc::read_file_byte;
76use crate::misc::read_file_usize_vec;
77use crate::misc::read_from_file;
78use crate::Cpumask;
79use anyhow::bail;
80use anyhow::Result;
81use glob::glob;
82use log::info;
83use log::warn;
84use sscanf::sscanf;
85use std::cmp::min;
86use std::collections::BTreeMap;
87use std::path::Path;
88use std::sync::Arc;
89
90#[cfg(feature = "gpu-topology")]
91use crate::gpu::{create_gpus, Gpu, GpuIndex};
92
lazy_static::lazy_static! {
    /// The maximum possible number of CPU IDs in the system, computed as the
    /// highest CPU ID found under sysfs plus one. This is different than the
    /// number of possible CPUs on the system (though very seldom is). This
    /// number may differ from the number of possible CPUs on the system when
    /// e.g. there are fully disabled CPUs in the middle of the range of
    /// possible CPUs (i.e. CPUs that may not be onlined).
    ///
    /// Panics on first access if the CPU IDs cannot be read from sysfs or if
    /// no CPU is found.
    pub static ref NR_CPU_IDS: usize = read_cpu_ids().unwrap().last().unwrap() + 1;

    /// The number of possible CPUs that may be active on the system. Note
    /// that this value is separate from the number of possible _CPU IDs_ in
    /// the system, as there may be gaps in what CPUs are allowed to be
    /// onlined. For example, some BIOS implementations may report spans of
    /// disabled CPUs that may not be onlined, whose IDs are lower than the
    /// IDs of other CPUs that may be onlined.
    ///
    /// Panics on first access if libbpf fails to report the value.
    pub static ref NR_CPUS_POSSIBLE: usize = libbpf_rs::num_possible_cpus().unwrap();

    /// Lower bound of the range to search for when finding the number of
    /// physical cores assigned to a partition to split a large number of
    /// cores that share an LLC domain. The suggested split for the cores
    /// isn't a function of the underlying hardware's capability, but rather
    /// some sane number to help determine the number of CPUs that share the
    /// same DSQ.
    pub static ref NR_PARTITION_MIN_CORES: usize = 2;
    /// Upper bound of the partition-size search range described for
    /// `NR_PARTITION_MIN_CORES`.
    pub static ref NR_PARTITION_MAX_CORES: usize = 8;
}
118
/// Classification of a core, derived from the raw per-CPU capacity values
/// gathered while the topology is built.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum CoreType {
    /// A big core. `turbo` is true when the CPU's raw capacity equals the
    /// system-wide maximum while the system-wide average is below that
    /// maximum.
    Big { turbo: bool },
    /// A little core: the system was detected as big/LITTLE and the CPU's
    /// raw capacity is below the system-wide average.
    Little,
}
124
/// A single logical CPU, the leaf of the topology hierarchy.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct Cpu {
    /// The CPU number, i.e. the `N` in sysfs `cpuN`.
    pub id: usize,
    /// Value of cpufreq `scaling_min_freq`, or 0 if it cannot be read.
    pub min_freq: usize,
    /// Value of cpufreq `scaling_max_freq`, or 0 if it cannot be read.
    pub max_freq: usize,
    /// Base operational frequency. Only available on Intel Turbo Boost
    /// CPUs. If not available, this will simply return maximum frequency.
    pub base_freq: usize,
    /// The best-effort guessing of cpu_capacity scaled to 1024.
    pub cpu_capacity: usize,
    /// Number of CPUs in the core this CPU belongs to. Filled in when the
    /// Topology is instantiated.
    pub smt_level: usize,
    /// CPU idle resume latency
    pub pm_qos_resume_latency_us: usize,
    /// Value of cpufreq `cpuinfo_transition_latency`, or 0 if it cannot be
    /// read.
    pub trans_lat_ns: usize,
    /// ID of the L2 cache this CPU uses, or `usize::MAX` if it could not be
    /// determined.
    pub l2_id: usize,
    /// ID of the L3 cache this CPU uses, or `usize::MAX` if it could not be
    /// determined.
    pub l3_id: usize,
    /// Per-CPU cache size of all levels.
    pub cache_size: usize,
    /// Big/little classification of this CPU.
    pub core_type: CoreType,

    /// Ancestor IDs.
    pub core_id: usize,
    pub llc_id: usize,
    pub node_id: usize,
    /// The sysfs value of physical_package_id
    pub package_id: usize,
    /// The sysfs value of cluster_id
    pub cluster_id: isize,
}
152
/// A physical core, holding the logical CPUs that belong to it.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub struct Core {
    /// Monotonically increasing unique id
    pub id: usize,
    /// The sysfs value of core_id
    pub kernel_id: usize,
    /// The sysfs value of cluster_id
    pub cluster_id: isize,
    /// CPUs of this core, keyed by CPU id.
    pub cpus: BTreeMap<usize, Arc<Cpu>>,
    /// Cpumask of all CPUs in this core.
    pub span: Cpumask,
    /// Big/little classification, taken from the CPU that first created this
    /// core entry.
    pub core_type: CoreType,

    /// Ancestor IDs.
    pub llc_id: usize,
    pub node_id: usize,
}
169
/// A last-level cache domain, holding the cores that share it.
#[derive(Debug, Clone)]
pub struct Llc {
    /// Monotonically increasing unique id
    pub id: usize,
    /// The kernel id of the llc
    pub kernel_id: usize,
    /// Cores of this LLC, keyed by the unique core id.
    pub cores: BTreeMap<usize, Arc<Core>>,
    /// Cpumask of all CPUs in this llc.
    pub span: Cpumask,

    /// Ancestor IDs.
    pub node_id: usize,

    /// Skip indices to access lower level members easily.
    /// Keyed by CPU id; filled in when the Topology is instantiated.
    pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}
186
/// A node (NUMA node, or the single default node) holding one or more LLCs.
#[derive(Debug, Clone)]
pub struct Node {
    /// Node id. The default node created for non-NUMA systems uses id 0.
    pub id: usize,
    /// Empty for the default node. NOTE(review): presumably the NUMA
    /// distances to every node when built from sysfs -- confirm against the
    /// NUMA node builder.
    pub distance: Vec<usize>,
    /// LLCs of this node, keyed by the unique LLC id.
    pub llcs: BTreeMap<usize, Arc<Llc>>,
    /// Cpumask of all CPUs in this node.
    pub span: Cpumask,

    /// Skip indices to access lower level members easily.
    /// Both are filled in when the Topology is instantiated.
    pub all_cores: BTreeMap<usize, Arc<Core>>,
    pub all_cpus: BTreeMap<usize, Arc<Cpu>>,

    /// GPUs attached to this node, keyed by GPU index.
    #[cfg(feature = "gpu-topology")]
    pub gpus: BTreeMap<GpuIndex, Gpu>,
}
202
/// The root of the topology hierarchy: Topology -> Node -> Llc -> Core -> Cpu.
#[derive(Debug)]
pub struct Topology {
    /// Nodes of the system, keyed by node id.
    pub nodes: BTreeMap<usize, Node>,
    /// Cpumask of all CPUs in the system.
    pub span: Cpumask,
    /// True if SMT is enabled in the system, false otherwise.
    pub smt_enabled: bool,

    /// Skip indices to access lower level members easily.
    /// Each map is keyed by the id of the member it holds.
    pub all_llcs: BTreeMap<usize, Arc<Llc>>,
    pub all_cores: BTreeMap<usize, Arc<Core>>,
    pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}
216
217impl Topology {
218    fn instantiate(span: Cpumask, mut nodes: BTreeMap<usize, Node>) -> Result<Self> {
219        // Build skip indices prefixed with all_ for easy lookups. As Arc
220        // objects can only be modified while there's only one reference,
221        // skip indices must be built from bottom to top.
222        let mut topo_llcs = BTreeMap::new();
223        let mut topo_cores = BTreeMap::new();
224        let mut topo_cpus = BTreeMap::new();
225
226        for (_node_id, node) in nodes.iter_mut() {
227            let mut node_cores = BTreeMap::new();
228            let mut node_cpus = BTreeMap::new();
229
230            for (&llc_id, llc) in node.llcs.iter_mut() {
231                let llc_mut = Arc::get_mut(llc).unwrap();
232                let mut llc_cpus = BTreeMap::new();
233
234                for (&core_id, core) in llc_mut.cores.iter_mut() {
235                    let core_mut = Arc::get_mut(core).unwrap();
236                    let smt_level = core_mut.cpus.len();
237
238                    for (&cpu_id, cpu) in core_mut.cpus.iter_mut() {
239                        let cpu_mut = Arc::get_mut(cpu).unwrap();
240                        cpu_mut.smt_level = smt_level;
241
242                        if topo_cpus
243                            .insert(cpu_id, cpu.clone())
244                            .or(node_cpus.insert(cpu_id, cpu.clone()))
245                            .or(llc_cpus.insert(cpu_id, cpu.clone()))
246                            .is_some()
247                        {
248                            bail!("Duplicate CPU ID {}", cpu_id);
249                        }
250                    }
251
252                    // Note that in some weird architectures, core ids can be
253                    // duplicated in different LLC domains.
254                    topo_cores
255                        .insert(core_id, core.clone())
256                        .or(node_cores.insert(core_id, core.clone()));
257                }
258
259                llc_mut.all_cpus = llc_cpus;
260
261                if topo_llcs.insert(llc_id, llc.clone()).is_some() {
262                    bail!("Duplicate LLC ID {}", llc_id);
263                }
264            }
265
266            node.all_cores = node_cores;
267            node.all_cpus = node_cpus;
268        }
269
270        Ok(Topology {
271            nodes,
272            span,
273            smt_enabled: is_smt_active().unwrap_or(false),
274            all_llcs: topo_llcs,
275            all_cores: topo_cores,
276            all_cpus: topo_cpus,
277        })
278    }
279
280    /// Build a complete host Topology
281    pub fn new() -> Result<Topology> {
282        Self::with_virt_llcs(None)
283    }
284
285    pub fn with_virt_llcs(nr_cores_per_vllc: Option<(usize, usize)>) -> Result<Topology> {
286        let span = cpus_online()?;
287        let mut topo_ctx = TopoCtx::new();
288
289        // If the kernel is compiled with CONFIG_NUMA, then build a topology
290        // from the NUMA hierarchy in sysfs. Otherwise, just make a single
291        // default node of ID 0 which contains all cores.
292        let path = format!("{}/sys/devices/system/node", *ROOT_PREFIX);
293        let nodes = if Path::new(&path).exists() {
294            create_numa_nodes(&span, &mut topo_ctx, nr_cores_per_vllc)?
295        } else {
296            create_default_node(&span, &mut topo_ctx, false, nr_cores_per_vllc)?
297        };
298
299        Self::instantiate(span, nodes)
300    }
301
302    pub fn with_flattened_llc_node() -> Result<Topology> {
303        let span = cpus_online()?;
304        let mut topo_ctx = TopoCtx::new();
305        let nodes = create_default_node(&span, &mut topo_ctx, true, None)?;
306        Self::instantiate(span, nodes)
307    }
308
309    /// Build a topology with configuration from CLI arguments.
310    /// This method integrates with the TopologyArgs from the cli module to
311    /// create a topology based on command line parameters.
312    pub fn with_args(topology_args: &crate::cli::TopologyArgs) -> Result<Topology> {
313        // Validate the CLI arguments first
314        topology_args.validate()?;
315
316        // Get the virtual LLC configuration
317        let nr_cores_per_vllc = topology_args.get_nr_cores_per_vllc();
318
319        // Build topology with the specified configuration
320        Self::with_virt_llcs(nr_cores_per_vllc)
321    }
322
323    /// Get a vec of all GPUs on the hosts.
324    #[cfg(feature = "gpu-topology")]
325    pub fn gpus(&self) -> BTreeMap<GpuIndex, &Gpu> {
326        let mut gpus = BTreeMap::new();
327        for node in self.nodes.values() {
328            for (idx, gpu) in &node.gpus {
329                gpus.insert(*idx, gpu);
330            }
331        }
332        gpus
333    }
334
335    /// Returns whether the Topology has a hybrid architecture of big and little cores.
336    pub fn has_little_cores(&self) -> bool {
337        self.all_cores
338            .values()
339            .any(|c| c.core_type == CoreType::Little)
340    }
341
342    /// Returns a vector that maps the index of each logical CPU to the
343    /// sibling CPU. This represents the "next sibling" CPU within a package
344    /// in systems that support SMT. The sibling CPU is the other logical
345    /// CPU that shares the physical resources of the same physical core.
346    ///
347    /// Assuming each core holds exactly at most two cpus.
348    pub fn sibling_cpus(&self) -> Vec<i32> {
349        let mut sibling_cpu = vec![-1i32; *NR_CPUS_POSSIBLE];
350        for core in self.all_cores.values() {
351            let mut first = -1i32;
352            for &cpu in core.cpus.keys() {
353                if first < 0 {
354                    first = cpu as i32;
355                } else {
356                    sibling_cpu[first as usize] = cpu as i32;
357                    sibling_cpu[cpu] = first;
358                    break;
359                }
360            }
361        }
362        sibling_cpu
363    }
364}
365
366/******************************************************
367 * Helper structs/functions for creating the Topology *
368 ******************************************************/
/// TopoCtx is a helper struct used to build a topology.
///
/// It carries the id allocations that must stay consistent across all the
/// CPUs inserted into one topology.
struct TopoCtx {
    /// Maps (node id, package id, kernel core id) to the unique core id
    /// allocated for it.
    node_core_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
    /// Maps (node id, package id, kernel LLC id) to the unique LLC id
    /// allocated for it.
    node_llc_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
    /// Maps the contents of an L2 cache's shared_cpu_list to its id.
    l2_ids: BTreeMap<String, usize>,
    /// Maps the contents of an L3 cache's shared_cpu_list to its id.
    l3_ids: BTreeMap<String, usize>,
}

impl TopoCtx {
    /// Creates a context in which no id has been allocated yet.
    fn new() -> TopoCtx {
        TopoCtx {
            node_core_kernel_ids: BTreeMap::new(),
            node_llc_kernel_ids: BTreeMap::new(),
            l2_ids: BTreeMap::new(),
            l3_ids: BTreeMap::new(),
        }
    }
}
395
396fn cpus_online() -> Result<Cpumask> {
397    let path = format!("{}/sys/devices/system/cpu/online", *ROOT_PREFIX);
398    let online = std::fs::read_to_string(path)?;
399    Cpumask::from_cpulist(&online)
400}
401
402fn get_cache_id(topo_ctx: &mut TopoCtx, cache_level_path: &Path, cache_level: usize) -> usize {
403    // Check if the cache id is already cached
404    let id_map = match cache_level {
405        2 => &mut topo_ctx.l2_ids,
406        3 => &mut topo_ctx.l3_ids,
407        _ => return usize::MAX,
408    };
409
410    let path = &cache_level_path.join("shared_cpu_list");
411    let key = match std::fs::read_to_string(path) {
412        Ok(key) => key,
413        Err(_) => return usize::MAX,
414    };
415
416    let id = *id_map.get(&key).unwrap_or(&usize::MAX);
417    if id != usize::MAX {
418        return id;
419    }
420
421    // In case of a cache miss, try to get the id from the sysfs first.
422    let id = read_from_file(&cache_level_path.join("id")).unwrap_or(usize::MAX);
423    if id != usize::MAX {
424        // Keep the id in the map
425        id_map.insert(key, id);
426        return id;
427    }
428
429    // If the id file does not exist, assign an id and keep it in the map.
430    let id = id_map.len();
431    id_map.insert(key, id);
432
433    id
434}
435
436fn get_per_cpu_cache_size(cache_path: &Path) -> Result<usize> {
437    let path_str = cache_path.to_str().unwrap();
438    let paths = glob(&(path_str.to_owned() + "/index[0-9]*"))?;
439    let mut tot_size = 0;
440
441    for index in paths.filter_map(Result::ok) {
442        // If there is no size information under sysfs (e.g., many ARM SoCs),
443        // give 1024 as a default value. 1024 is small enough compared to the
444        // real cache size of the CPU, but it is large enough to give a penalty
445        // when multiple CPUs share the cache.
446        let size = read_file_byte(&index.join("size")).unwrap_or(1024_usize);
447        let cpulist: String = read_from_file(&index.join("shared_cpu_list"))?;
448        let num_cpus = read_cpulist(&cpulist)?.len();
449        tot_size += size / num_cpus;
450    }
451
452    Ok(tot_size)
453}
454
455#[allow(clippy::too_many_arguments)]
456fn create_insert_cpu(
457    id: usize,
458    node: &mut Node,
459    online_mask: &Cpumask,
460    topo_ctx: &mut TopoCtx,
461    cs: &CapacitySource,
462    flatten_llc: bool,
463) -> Result<()> {
464    // CPU is offline. The Topology hierarchy is read-only, and assumes
465    // that hotplug will cause the scheduler to restart. Thus, we can
466    // just skip this CPU altogether.
467    if !online_mask.test_cpu(id) {
468        return Ok(());
469    }
470
471    let cpu_str = format!("{}/sys/devices/system/cpu/cpu{}", *ROOT_PREFIX, id);
472    let cpu_path = Path::new(&cpu_str);
473
474    // Physical core ID
475    let top_path = cpu_path.join("topology");
476    let core_kernel_id = read_from_file(&top_path.join("core_id"))?;
477    let package_id = read_from_file(&top_path.join("physical_package_id"))?;
478    let cluster_id = read_from_file(&top_path.join("cluster_id"))?;
479
480    // Evaluate L2, L3 and LLC cache IDs.
481    //
482    // Use ID 0 if we fail to detect the cache hierarchy. This seems to happen on certain SKUs, so
483    // if there's no cache information then we have no option but to assume a single unified cache
484    // per node.
485    let cache_path = cpu_path.join("cache");
486    let l2_id = get_cache_id(topo_ctx, &cache_path.join(format!("index{}", 2)), 2);
487    let l3_id = get_cache_id(topo_ctx, &cache_path.join(format!("index{}", 3)), 3);
488    let llc_kernel_id = if flatten_llc {
489        0
490    } else if l3_id == usize::MAX {
491        l2_id
492    } else {
493        l3_id
494    };
495
496    // Per-CPU cache size
497    let cache_size = get_per_cpu_cache_size(&cache_path).unwrap_or(0_usize);
498
499    // Min and max frequencies. If the kernel is not compiled with
500    // CONFIG_CPU_FREQ, just assume 0 for both frequencies.
501    let freq_path = cpu_path.join("cpufreq");
502    let min_freq = read_from_file(&freq_path.join("scaling_min_freq")).unwrap_or(0_usize);
503    let max_freq = read_from_file(&freq_path.join("scaling_max_freq")).unwrap_or(0_usize);
504    let base_freq = read_from_file(&freq_path.join("base_frequency")).unwrap_or(max_freq);
505    let trans_lat_ns =
506        read_from_file(&freq_path.join("cpuinfo_transition_latency")).unwrap_or(0_usize);
507
508    // Cpu capacity
509    let cap_path = cpu_path.join(cs.suffix.clone());
510    let rcap = read_from_file(&cap_path).unwrap_or(cs.max_rcap);
511    let cpu_capacity = (rcap * 1024) / cs.max_rcap;
512
513    // Power management
514    let power_path = cpu_path.join("power");
515    let pm_qos_resume_latency_us =
516        read_from_file(&power_path.join("pm_qos_resume_latency_us")).unwrap_or(0_usize);
517
518    let num_llcs = topo_ctx.node_llc_kernel_ids.len();
519    let llc_id = topo_ctx
520        .node_llc_kernel_ids
521        .entry((node.id, package_id, llc_kernel_id))
522        .or_insert(num_llcs);
523
524    let llc = node.llcs.entry(*llc_id).or_insert(Arc::new(Llc {
525        id: *llc_id,
526        cores: BTreeMap::new(),
527        span: Cpumask::new(),
528        all_cpus: BTreeMap::new(),
529
530        node_id: node.id,
531        kernel_id: llc_kernel_id,
532    }));
533    let llc_mut = Arc::get_mut(llc).unwrap();
534
535    let core_type = if cs.avg_rcap < cs.max_rcap && rcap == cs.max_rcap {
536        CoreType::Big { turbo: true }
537    } else if !cs.has_biglittle || rcap >= cs.avg_rcap {
538        CoreType::Big { turbo: false }
539    } else {
540        CoreType::Little
541    };
542
543    let num_cores = topo_ctx.node_core_kernel_ids.len();
544    let core_id = topo_ctx
545        .node_core_kernel_ids
546        .entry((node.id, package_id, core_kernel_id))
547        .or_insert(num_cores);
548
549    let core = llc_mut.cores.entry(*core_id).or_insert(Arc::new(Core {
550        id: *core_id,
551        cpus: BTreeMap::new(),
552        span: Cpumask::new(),
553        core_type: core_type.clone(),
554
555        llc_id: *llc_id,
556        node_id: node.id,
557        kernel_id: core_kernel_id,
558        cluster_id,
559    }));
560    let core_mut = Arc::get_mut(core).unwrap();
561
562    core_mut.cpus.insert(
563        id,
564        Arc::new(Cpu {
565            id,
566            min_freq,
567            max_freq,
568            base_freq,
569            cpu_capacity,
570            smt_level: 0, // Will be initialized at instantiate().
571            pm_qos_resume_latency_us,
572            trans_lat_ns,
573            l2_id,
574            l3_id,
575            cache_size,
576            core_type: core_type.clone(),
577
578            core_id: *core_id,
579            llc_id: *llc_id,
580            node_id: node.id,
581            package_id,
582            cluster_id,
583        }),
584    );
585
586    if node.span.test_cpu(id) {
587        bail!("Node {} already had CPU {}", node.id, id);
588    }
589
590    // Update all of the devices' spans to include this CPU.
591    core_mut.span.set_cpu(id)?;
592    llc_mut.span.set_cpu(id)?;
593    node.span.set_cpu(id)?;
594
595    Ok(())
596}
597
598fn read_cpu_ids() -> Result<Vec<usize>> {
599    let mut cpu_ids = vec![];
600    let path = format!("{}/sys/devices/system/cpu/cpu[0-9]*", *ROOT_PREFIX);
601    let cpu_paths = glob(&path)?;
602    for cpu_path in cpu_paths.filter_map(Result::ok) {
603        let cpu_str = cpu_path.to_str().unwrap().trim();
604        if ROOT_PREFIX.is_empty() {
605            match sscanf!(cpu_str, "/sys/devices/system/cpu/cpu{usize}") {
606                Ok(val) => cpu_ids.push(val),
607                Err(_) => {
608                    bail!("Failed to parse cpu ID {}", cpu_str);
609                }
610            }
611        } else {
612            match sscanf!(cpu_str, "{str}/sys/devices/system/cpu/cpu{usize}") {
613                Ok((_, val)) => cpu_ids.push(val),
614                Err(_) => {
615                    bail!("Failed to parse cpu ID {}", cpu_str);
616                }
617            }
618        }
619    }
620    cpu_ids.sort();
621    Ok(cpu_ids)
622}
623
/// Describes where raw CPU capacity values are read from, together with the
/// system-wide statistics needed to scale and classify them.
struct CapacitySource {
    /// Path suffix after /sys/devices/system/cpu/cpuX
    /// (empty when no capacity information is available).
    suffix: String,
    /// Average raw capacity value
    avg_rcap: usize,
    /// Maximum raw capacity value
    max_rcap: usize,
    /// Does a system have little cores?
    has_biglittle: bool,
}
634
635fn get_capacity_source() -> Option<CapacitySource> {
636    // Sources for guessing cpu_capacity under /sys/devices/system/cpu/cpuX.
637    // They should be ordered from the most precise to the least precise.
638    let sources = [
639        "cpufreq/amd_pstate_prefcore_ranking",
640        "cpufreq/amd_pstate_highest_perf",
641        "acpi_cppc/highest_perf",
642        "cpu_capacity",
643        "cpufreq/cpuinfo_max_freq",
644    ];
645
646    // Find the most precise source for cpu_capacity estimation.
647    let prefix = format!("{}/sys/devices/system/cpu/cpu0", *ROOT_PREFIX);
648    let mut raw_capacity;
649    let mut suffix = sources[sources.len() - 1];
650    'outer: for src in sources {
651        let path_str = [prefix.clone(), src.to_string()].join("/");
652        let path = Path::new(&path_str);
653        raw_capacity = read_from_file(&path).unwrap_or(0_usize);
654        if raw_capacity > 0 {
655            // It would be an okay source...
656            suffix = src;
657            // But double-check if the source has meaningful information.
658            let path = format!("{}/sys/devices/system/cpu/cpu[0-9]*", *ROOT_PREFIX);
659            let cpu_paths = glob(&path).ok()?;
660            for cpu_path in cpu_paths.filter_map(Result::ok) {
661                let raw_capacity2 = read_from_file(&cpu_path.join(suffix)).unwrap_or(0_usize);
662                if raw_capacity != raw_capacity2 {
663                    break 'outer;
664                }
665            }
666            // The source exists, but it tells that all CPUs have the same
667            // capacity. Let's search more if there is any source that can
668            // tell the capacity differences among CPUs. This can happen when
669            // a buggy driver lies (e.g., "acpi_cppc/highest_perf").
670        }
671    }
672
673    // Find the max raw_capacity value for scaling to 1024.
674    let mut max_rcap = 0;
675    let mut min_rcap = usize::MAX;
676    let mut avg_rcap = 0;
677    let mut nr_cpus = 0;
678    let mut has_biglittle = false;
679    let path = format!("{}/sys/devices/system/cpu/cpu[0-9]*", *ROOT_PREFIX);
680    let cpu_paths = glob(&path).ok()?;
681    for cpu_path in cpu_paths.filter_map(Result::ok) {
682        let rcap = read_from_file(&cpu_path.join(suffix)).unwrap_or(0_usize);
683        if max_rcap < rcap {
684            max_rcap = rcap;
685        }
686        if min_rcap > rcap {
687            min_rcap = rcap;
688        }
689        avg_rcap += rcap;
690        nr_cpus += 1;
691    }
692
693    if nr_cpus == 0 || max_rcap == 0 {
694        suffix = "";
695        avg_rcap = 1024;
696        max_rcap = 1024;
697        warn!("CPU capacity information is not available under sysfs.");
698    } else {
699        avg_rcap /= nr_cpus;
700        // We consider a system to have a heterogeneous CPU architecture only
701        // when there is a significant capacity gap (e.g., 1.3x). CPU capacities
702        // can still vary in a homogeneous architecture—for instance, due to
703        // chip binning or when only a subset of CPUs supports turbo boost.
704        //
705        // Note that we need a more systematic approach to accurately detect
706        // big/LITTLE architectures across various SoC designs. The current
707        // approach, with a significant capacity difference, is somewhat ad-hoc.
708        has_biglittle = max_rcap as f32 >= (1.3 * min_rcap as f32);
709    }
710
711    Some(CapacitySource {
712        suffix: suffix.to_string(),
713        avg_rcap,
714        max_rcap,
715        has_biglittle,
716    })
717}
718
719fn is_smt_active() -> Option<bool> {
720    let path = format!("{}/sys/devices/system/cpu/smt/active", *ROOT_PREFIX);
721    let smt_on: u8 = read_from_file(Path::new(&path)).ok()?;
722    Some(smt_on == 1)
723}
724
725fn replace_with_virt_llcs(
726    node: &mut Node,
727    min_cores: usize,
728    max_cores: usize,
729    start_id: usize,
730) -> Result<usize> {
731    let mut next_id = start_id;
732    let mut core_to_partition: BTreeMap<usize, usize> = BTreeMap::new();
733    let mut partition_to_kernel_id: BTreeMap<usize, usize> = BTreeMap::new();
734    let num_orig_llcs = node.llcs.len();
735
736    // First pass: determine core to partition mapping, partition to
737    // kernel_id mapping, and total partitions needed
738    for (_llc_id, llc) in node.llcs.iter() {
739        // Group cores by type (big/little) to partition separately
740        let mut cores_by_type: BTreeMap<bool, Vec<usize>> = BTreeMap::new();
741
742        for (core_id, core) in llc.cores.iter() {
743            let core_type = core.core_type == CoreType::Little;
744            cores_by_type
745                .entry(core_type)
746                .or_insert(Vec::new())
747                .push(*core_id);
748        }
749
750        for (_core_type, core_ids) in cores_by_type.iter() {
751            let num_cores_in_bucket = core_ids.len();
752
753            // Find optimal partition size within specified range
754            let best_split = find_best_split_size(num_cores_in_bucket, min_cores, max_cores);
755            let num_partitions = num_cores_in_bucket / best_split;
756
757            // Assign cores to partitions within a group type
758            for (bucket_idx, &core_id) in core_ids.iter().enumerate() {
759                let partition_idx = min(bucket_idx / best_split, num_partitions - 1);
760                let current_partition_id = next_id + partition_idx;
761                core_to_partition.insert(core_id, current_partition_id);
762                partition_to_kernel_id.insert(current_partition_id, llc.kernel_id);
763            }
764
765            next_id += num_partitions;
766        }
767    }
768
769    // Create new virtual LLC structures based on partitioning found above
770    let mut virt_llcs: BTreeMap<usize, Arc<Llc>> = BTreeMap::new();
771
772    for vllc_id in start_id..next_id {
773        let kernel_id = partition_to_kernel_id.get(&vllc_id).copied().unwrap();
774        virt_llcs.insert(
775            vllc_id,
776            Arc::new(Llc {
777                id: vllc_id,
778                kernel_id,
779                cores: BTreeMap::new(),
780                span: Cpumask::new(),
781                node_id: node.id,
782                all_cpus: BTreeMap::new(),
783            }),
784        );
785    }
786
787    // Second pass: move cores to the appropriate new LLC based on partition
788    for (_llc_id, llc) in node.llcs.iter_mut() {
789        for (core_id, core) in llc.cores.iter() {
790            if let Some(&target_partition_id) = core_to_partition.get(core_id) {
791                if let Some(target_llc) = virt_llcs.get_mut(&target_partition_id) {
792                    let target_llc_mut = Arc::get_mut(target_llc).unwrap();
793
794                    // Clone core and update its LLC ID to match new partition
795                    let mut new_core = (**core).clone();
796                    new_core.llc_id = target_partition_id;
797
798                    // Update all CPUs within this core to reference new LLC ID
799                    let mut updated_cpus = BTreeMap::new();
800                    for (cpu_id, cpu) in new_core.cpus.iter() {
801                        let mut new_cpu = (**cpu).clone();
802                        new_cpu.llc_id = target_partition_id;
803
804                        // Add CPU to the virtual LLC's span
805                        target_llc_mut.span.set_cpu(*cpu_id)?;
806
807                        updated_cpus.insert(*cpu_id, Arc::new(new_cpu));
808                    }
809                    new_core.cpus = updated_cpus;
810
811                    // Add the updated core to the virtual LLC
812                    target_llc_mut.cores.insert(*core_id, Arc::new(new_core));
813                }
814            }
815        }
816    }
817
818    // Replace original LLCs with virtual LLCs
819    node.llcs = virt_llcs;
820
821    let num_virt_llcs = next_id - start_id;
822    let vllc_sizes: Vec<usize> = node.llcs.values().map(|llc| llc.cores.len()).collect();
823
824    if vllc_sizes.is_empty() {
825        return Ok(next_id);
826    }
827
828    // Most vLLCs should have the same size, only the last one might differ
829    let common_size = vllc_sizes[0];
830    let last_size = *vllc_sizes.last().unwrap();
831
832    if common_size == last_size {
833        info!(
834            "Node {}: split {} LLC(s) into {} virtual LLCs with {} cores each",
835            node.id, num_orig_llcs, num_virt_llcs, common_size
836        );
837    } else {
838        info!(
839            "Node {}: split {} LLC(s) into {} virtual LLCs with {} cores each (last with {})",
840            node.id, num_orig_llcs, num_virt_llcs, common_size, last_size
841        );
842    }
843
844    Ok(next_id)
845}
846
847fn create_default_node(
848    online_mask: &Cpumask,
849    topo_ctx: &mut TopoCtx,
850    flatten_llc: bool,
851    nr_cores_per_vllc: Option<(usize, usize)>,
852) -> Result<BTreeMap<usize, Node>> {
853    let mut nodes = BTreeMap::<usize, Node>::new();
854
855    let mut node = Node {
856        id: 0,
857        distance: vec![],
858        llcs: BTreeMap::new(),
859        span: Cpumask::new(),
860        #[cfg(feature = "gpu-topology")]
861        gpus: BTreeMap::new(),
862        all_cores: BTreeMap::new(),
863        all_cpus: BTreeMap::new(),
864    };
865
866    #[cfg(feature = "gpu-topology")]
867    {
868        let system_gpus = create_gpus();
869        if let Some(gpus) = system_gpus.get(&0) {
870            for gpu in gpus {
871                node.gpus.insert(gpu.index, gpu.clone());
872            }
873        }
874    }
875
876    let path = format!("{}/sys/devices/system/cpu", *ROOT_PREFIX);
877    if !Path::new(&path).exists() {
878        bail!("/sys/devices/system/cpu sysfs node not found");
879    }
880
881    let cs = get_capacity_source().unwrap();
882    let cpu_ids = read_cpu_ids()?;
883    for cpu_id in cpu_ids.iter() {
884        create_insert_cpu(*cpu_id, &mut node, online_mask, topo_ctx, &cs, flatten_llc)?;
885    }
886
887    if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc {
888        replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, 0)?;
889    }
890
891    nodes.insert(node.id, node);
892
893    Ok(nodes)
894}
895
896fn create_numa_nodes(
897    online_mask: &Cpumask,
898    topo_ctx: &mut TopoCtx,
899    nr_cores_per_vllc: Option<(usize, usize)>,
900) -> Result<BTreeMap<usize, Node>> {
901    let mut nodes = BTreeMap::<usize, Node>::new();
902    let mut next_virt_llc_id = 0;
903
904    #[cfg(feature = "gpu-topology")]
905    let system_gpus = create_gpus();
906
907    let path = format!("{}/sys/devices/system/node/node*", *ROOT_PREFIX);
908    let numa_paths = glob(&path)?;
909    for numa_path in numa_paths.filter_map(Result::ok) {
910        let numa_str = numa_path.to_str().unwrap().trim();
911        let node_id = if ROOT_PREFIX.is_empty() {
912            match sscanf!(numa_str, "/sys/devices/system/node/node{usize}") {
913                Ok(val) => val,
914                Err(_) => {
915                    bail!("Failed to parse NUMA node ID {}", numa_str);
916                }
917            }
918        } else {
919            match sscanf!(numa_str, "{str}/sys/devices/system/node/node{usize}") {
920                Ok((_, val)) => val,
921                Err(_) => {
922                    bail!("Failed to parse NUMA node ID {}", numa_str);
923                }
924            }
925        };
926
927        let distance = read_file_usize_vec(
928            Path::new(&format!(
929                "{}/sys/devices/system/node/node{}/distance",
930                *ROOT_PREFIX, node_id
931            )),
932            ' ',
933        )?;
934        let mut node = Node {
935            id: node_id,
936            distance,
937            llcs: BTreeMap::new(),
938            span: Cpumask::new(),
939
940            all_cores: BTreeMap::new(),
941            all_cpus: BTreeMap::new(),
942
943            #[cfg(feature = "gpu-topology")]
944            gpus: BTreeMap::new(),
945        };
946
947        #[cfg(feature = "gpu-topology")]
948        {
949            if let Some(gpus) = system_gpus.get(&node_id) {
950                for gpu in gpus {
951                    node.gpus.insert(gpu.index, gpu.clone());
952                }
953            }
954        }
955
956        let cpu_pattern = numa_path.join("cpu[0-9]*");
957        let cpu_paths = glob(cpu_pattern.to_string_lossy().as_ref())?;
958        let cs = get_capacity_source().unwrap();
959        let mut cpu_ids = vec![];
960        for cpu_path in cpu_paths.filter_map(Result::ok) {
961            let cpu_str = cpu_path.to_str().unwrap().trim();
962            let cpu_id = if ROOT_PREFIX.is_empty() {
963                match sscanf!(cpu_str, "/sys/devices/system/node/node{usize}/cpu{usize}") {
964                    Ok((_, val)) => val,
965                    Err(_) => {
966                        bail!("Failed to parse cpu ID {}", cpu_str);
967                    }
968                }
969            } else {
970                match sscanf!(
971                    cpu_str,
972                    "{str}/sys/devices/system/node/node{usize}/cpu{usize}"
973                ) {
974                    Ok((_, _, val)) => val,
975                    Err(_) => {
976                        bail!("Failed to parse cpu ID {}", cpu_str);
977                    }
978                }
979            };
980            cpu_ids.push(cpu_id);
981        }
982        cpu_ids.sort();
983
984        for cpu_id in cpu_ids {
985            create_insert_cpu(cpu_id, &mut node, online_mask, topo_ctx, &cs, false)?;
986        }
987
988        if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc {
989            next_virt_llc_id =
990                replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, next_virt_llc_id)?;
991        }
992
993        nodes.insert(node.id, node);
994    }
995    Ok(nodes)
996}