Skip to main content

scx_utils/
gpu.rs

1#![cfg(feature = "gpu-topology")]
2
3use crate::misc::read_from_file;
4use crate::{Cpumask, NR_CPU_IDS};
5use nvml_wrapper::bitmasks::InitFlags;
6use nvml_wrapper::enum_wrappers::device::{Clock, PerformanceState, TopologyLevel};
7use nvml_wrapper::Nvml;
8use nvml_wrapper_sys::bindings::NVML_AFFINITY_SCOPE_NODE;
9use std::collections::BTreeMap;
10use std::path::Path;
11
/// Vendor-tagged identifier for a GPU.
///
/// Only NVIDIA devices are represented at the moment; they are keyed by the
/// device index reported by NVML.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum GpuIndex {
    /// An NVIDIA GPU.
    Nvidia {
        /// Device index as reported by NVML.
        nvml_id: u32,
    },
}
16
17#[derive(Debug, Clone)]
18pub struct Gpu {
19    pub index: GpuIndex,
20    pub node_id: usize,
21    pub max_graphics_clock: usize,
22    // AMD uses CU for this value
23    pub max_sm_clock: usize,
24    // Frequency of the GPU's memory
25    pub max_mem_clock: usize,
26    // Streaming Multiprocessor count
27    pub multiproc_count: usize,
28    pub memory: u64,
29    pub cpu_mask: Cpumask,
30    // Represents the ordered list of nearest
31    // available devices in term of topology
32    // connectivity (as for now in term of PCI board).
33    pub nearest: Vec<GpuIndex>,
34    // Current (P)State which determines the
35    // performance level/energy consumption ratio
36    // starting with Zero being the highest.
37    pub perf_state: PerformanceState,
38}
39
40pub fn create_gpus() -> BTreeMap<usize, Vec<Gpu>> {
41    let mut gpus: BTreeMap<usize, Vec<Gpu>> = BTreeMap::new();
42
43    // Don't fail if the system has no NVIDIA GPUs.
44    let Ok(nvml) = Nvml::init_with_flags(InitFlags::NO_GPUS) else {
45        return BTreeMap::new();
46    };
47    if let Ok(nvidia_gpu_count) = nvml.device_count() {
48        for i in 0..nvidia_gpu_count {
49            let Ok(nvidia_gpu) = nvml.device_by_index(i) else {
50                continue;
51            };
52            let graphics_boost_clock = nvidia_gpu
53                .max_customer_boost_clock(Clock::Graphics)
54                .unwrap_or(0);
55            let sm_boost_clock = nvidia_gpu.max_customer_boost_clock(Clock::SM).unwrap_or(0);
56            let mem_boost_clock = nvidia_gpu
57                .max_customer_boost_clock(Clock::Memory)
58                .unwrap_or(0);
59            let Ok(memory_info) = nvidia_gpu.memory_info() else {
60                continue;
61            };
62            let Ok(pci_info) = nvidia_gpu.pci_info() else {
63                continue;
64            };
65            let Ok(index) = nvidia_gpu.index() else {
66                continue;
67            };
68
69            let cpu_mask = if let Ok(cpu_affinity) =
70                nvidia_gpu.cpu_affinity_within_scope(*NR_CPU_IDS, NVML_AFFINITY_SCOPE_NODE)
71            {
72                // Note: nvml returns it as an arch dependent array of integrals
73                #[cfg(target_pointer_width = "32")]
74                let cpu_affinity: Vec<u64> = cpu_affinity
75                    .chunks_exact(2)
76                    .map(|pair| (pair[1] as u64) << 32 | pair[0] as u64)
77                    .collect();
78                Cpumask::from_vec(cpu_affinity)
79            } else {
80                Cpumask::new()
81            };
82
83            let multiproc_count = if let Ok(attributes) = nvidia_gpu.attributes() {
84                attributes.multiprocessor_count
85            } else {
86                0
87            };
88
89            let nearest_gpu_topology_level = if nvidia_gpu.is_multi_gpu_board().unwrap_or(false) {
90                // e.g. for some Tesla models, this mode is faster
91                // as units are part of the same physical device
92                TopologyLevel::Internal
93            } else {
94                TopologyLevel::HostBridge
95            };
96
97            // NVML_TOPOLOGY_HOSTBRIDGE is supported by all models and
98            // NVML_TOPOLOGY_SYSTEM is an expensive check (i.e. all sort of
99            // multi gpus connection types).
100            let nearest = if let Ok(nearest_gpus) =
101                nvidia_gpu.topology_nearest_gpus(nearest_gpu_topology_level)
102            {
103                nearest_gpus
104                    .iter()
105                    .filter_map(|d| {
106                        if let Ok(idx) = d.index() {
107                            Some(GpuIndex::Nvidia { nvml_id: idx })
108                        } else {
109                            None
110                        }
111                    })
112                    .collect()
113            } else {
114                Vec::new()
115            };
116
117            let perf_state = nvidia_gpu
118                .performance_state()
119                .unwrap_or(PerformanceState::Unknown);
120
121            // The NVML library doesn't return a PCIe bus ID compatible with sysfs. It includes
122            // uppercase bus ID values and an extra four leading 0s.
123            let bus_id = pci_info.bus_id.to_lowercase();
124            let fixed_bus_id = bus_id.strip_prefix("0000").unwrap_or("");
125            let numa_path = format!("/sys/bus/pci/devices/{}/numa_node", fixed_bus_id);
126            let numa_node = read_from_file(Path::new(&numa_path)).unwrap_or(0_usize);
127
128            let gpu = Gpu {
129                index: GpuIndex::Nvidia { nvml_id: index },
130                node_id: numa_node as usize,
131                max_graphics_clock: graphics_boost_clock as usize,
132                max_sm_clock: sm_boost_clock as usize,
133                max_mem_clock: mem_boost_clock as usize,
134                multiproc_count: multiproc_count as usize,
135                memory: memory_info.total,
136                cpu_mask,
137                nearest,
138                perf_state,
139            };
140            gpus.entry(gpu.node_id).or_default().push(gpu);
141        }
142    }
143
144    gpus
145}