scx_lavd/
cpu_order.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2025 Valve Corporation.
4// Author: Changwoo Min <changwoo@igalia.com>
5
6// This software may be used and distributed according to the terms of the
7// GNU General Public License version 2.
8
9use anyhow::Result;
10use combinations::Combinations;
11use itertools::iproduct;
12use scx_utils::CoreType;
13use scx_utils::Cpumask;
14use scx_utils::EnergyModel;
15use scx_utils::PerfDomain;
16use scx_utils::PerfState;
17use scx_utils::Topology;
18use scx_utils::NR_CPU_IDS;
19use std::cell::Cell;
20use std::cell::RefCell;
21use std::collections::BTreeMap;
22use std::collections::BTreeSet;
23use std::collections::HashSet;
24use std::fmt;
25use std::hash::{Hash, Hasher};
26use tracing::debug;
27
/// Per-CPU identification and attributes used to build CPU preference orders.
#[derive(Debug, Clone)]
pub struct CpuId {
    // - *_adx: an absolute index within a system scope
    // - *_rdx: a relative index under a parent
    //
    // - numa_adx: a NUMA domain within a system
    // - pd_adx: a performance domain (CPU frequency domain) within a system
    //   - llc_rdx: an LLC domain (CCX) under a NUMA domain
    //   - llc_kernel_id: physical LLC domain ID provided by the kernel
    //     - core_rdx: a core under a LLC domain
    //       - cpu_rdx: a CPU under a core
    pub numa_adx: usize,
    pub pd_adx: usize,
    pub llc_adx: usize, // absolute LLC index within the system
    pub llc_rdx: usize,
    pub llc_kernel_id: usize,
    pub core_rdx: usize,
    pub cpu_rdx: usize,
    pub cpu_adx: usize,
    pub smt_level: usize,   // SMT level of this CPU's core, as reported by Topology
    pub cache_size: usize,  // cache size of this CPU, as reported by Topology
    pub cpu_cap: usize,     // CPU capacity, as reported by Topology
    pub big_core: bool,     // true unless the core type is LITTLE
    pub turbo_core: bool,   // true for a turbo-capable big core
    pub cpu_sibling: usize, // SMT sibling CPU ID of this CPU
}
54
/// Key identifying a compute domain: CPUs under the same NUMA node and
/// LLC (virtual and physical) with the same core type.
///
/// NOTE: the derived Ord compares fields in declaration order, which in
/// turn fixes the iteration order of the compute-domain BTreeMap — do
/// not reorder fields.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone)]
pub struct ComputeDomainId {
    pub numa_adx: usize,
    pub llc_adx: usize,
    pub llc_rdx: usize,
    pub llc_kernel_id: usize,
    pub is_big: bool,
}
63
/// A compute domain: the set of CPUs sharing a NUMA node, an LLC, and a
/// core type (see build_cpdom()).
#[derive(Debug, Clone)]
pub struct ComputeDomain {
    pub cpdom_id: usize,           // unique ID of this domain
    pub cpdom_alt_id: Cell<usize>, // ID of the closest domain of the opposite
                                   // core type; stays equal to cpdom_id when
                                   // no such domain is found
    pub cpu_ids: Vec<usize>,       // absolute CPU IDs belonging to this domain
    // Neighbor domain IDs keyed by distance (see CpuOrderCtx::dist);
    // each list is circularly sorted to preserve proximity.
    pub neighbor_map: RefCell<BTreeMap<usize, RefCell<Vec<usize>>>>,
}
71
/// CPU preference order for one performance-capacity range.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct PerfCpuOrder {
    pub perf_cap: usize,                 // performance in capacity
    pub perf_util: f32,                  // performance in utilization, [0, 1]
    pub cpus_perf: RefCell<Vec<usize>>,  // CPU adx order within the performance range by @perf_cap
    pub cpus_ovflw: RefCell<Vec<usize>>, // CPU adx order beyond @perf_cap
}
80
/// The final product: CPU preference orders plus topology summary info,
/// built by CpuOrder::new().
#[derive(Debug)]
#[allow(dead_code)]
pub struct CpuOrder {
    pub all_cpus_mask: Cpumask, // cpumask spanning all CPUs in the topology
    pub cpuids: Vec<CpuId>,     // CPUs in the performance-preference order
    pub perf_cpu_order: BTreeMap<usize, PerfCpuOrder>, // keyed by perf_cap
    pub cpdom_map: BTreeMap<ComputeDomainId, ComputeDomain>,
    pub nr_cpus: usize,   // number of CPUs in the topology
    pub nr_cores: usize,  // number of physical cores
    pub nr_cpdoms: usize, // number of compute domains
    pub nr_llcs: usize,   // number of LLC domains
    pub nr_numa: usize,   // number of NUMA nodes
    pub smt_enabled: bool,
    pub has_biglittle: bool,
    pub has_energy_model: bool,
}
97
98impl CpuOrder {
99    /// Build a cpu preference order with optional topology configuration
100    pub fn new(topology_args: Option<&scx_utils::TopologyArgs>) -> Result<CpuOrder> {
101        let ctx = CpuOrderCtx::new(topology_args)?;
102        let cpus_pf = ctx.build_topo_order(false).unwrap();
103        let cpus_ps = ctx.build_topo_order(true).unwrap();
104        let cpdom_map = CpuOrderCtx::build_cpdom(&cpus_pf).unwrap();
105        let perf_cpu_order = if ctx.em.is_ok() {
106            let em = ctx.em.unwrap();
107            EnergyModelOptimizer::get_perf_cpu_order_table(&em, &cpus_pf)
108        } else {
109            EnergyModelOptimizer::get_fake_perf_cpu_order_table(&cpus_pf, &cpus_ps)
110        };
111
112        let nr_cpdoms = cpdom_map.len();
113        Ok(CpuOrder {
114            all_cpus_mask: ctx.topo.span,
115            cpuids: cpus_pf,
116            perf_cpu_order,
117            cpdom_map,
118            nr_cpus: ctx.topo.all_cpus.len(),
119            nr_cores: ctx.topo.all_cores.len(),
120            nr_cpdoms,
121            nr_llcs: ctx.topo.all_llcs.len(),
122            nr_numa: ctx.topo.nodes.len(),
123            smt_enabled: ctx.smt_enabled,
124            has_biglittle: ctx.has_biglittle,
125            has_energy_model: ctx.has_energy_model,
126        })
127    }
128}
129
/// CpuOrderCtx is a helper struct used to build a CpuOrder
struct CpuOrderCtx {
    topo: Topology,          // system topology
    em: Result<EnergyModel>, // energy model; Err when the kernel has none
    smt_enabled: bool,       // cached from topo.smt_enabled
    has_biglittle: bool,     // cached from topo.has_little_cores()
    has_energy_model: bool,  // cached em.is_ok()
}
138
139impl CpuOrderCtx {
140    fn new(topology_args: Option<&scx_utils::TopologyArgs>) -> Result<Self> {
141        let topo = match topology_args {
142            Some(args) => Topology::with_args(args)?,
143            None => Topology::new()?,
144        };
145
146        let em = EnergyModel::new();
147        let smt_enabled = topo.smt_enabled;
148        let has_biglittle = topo.has_little_cores();
149        let has_energy_model = em.is_ok();
150
151        debug!("{:#?}", topo);
152        debug!("{:#?}", em);
153
154        Ok(CpuOrderCtx {
155            topo,
156            em,
157            smt_enabled,
158            has_biglittle,
159            has_energy_model,
160        })
161    }
162
163    /// Build a CPU preference order based on its optimization target
164    fn build_topo_order(&self, prefer_powersave: bool) -> Option<Vec<CpuId>> {
165        let mut cpu_ids = Vec::new();
166        let smt_siblings = self.topo.sibling_cpus();
167
168        // Build a vector of cpu ids.
169        for (&numa_adx, node) in self.topo.nodes.iter() {
170            for (llc_rdx, (&llc_adx, llc)) in node.llcs.iter().enumerate() {
171                for (core_rdx, (_core_adx, core)) in llc.cores.iter().enumerate() {
172                    for (cpu_rdx, (cpu_adx, cpu)) in core.cpus.iter().enumerate() {
173                        let cpu_adx = *cpu_adx;
174                        let pd_adx = Self::get_pd_id(&self.em, cpu_adx, llc_adx);
175                        let cpu_id = CpuId {
176                            numa_adx,
177                            pd_adx,
178                            llc_adx,
179                            llc_rdx,
180                            core_rdx,
181                            cpu_rdx,
182                            cpu_adx,
183                            smt_level: cpu.smt_level,
184                            cache_size: cpu.cache_size,
185                            cpu_cap: cpu.cpu_capacity,
186                            big_core: cpu.core_type != CoreType::Little,
187                            turbo_core: cpu.core_type == CoreType::Big { turbo: true },
188                            cpu_sibling: smt_siblings[cpu_adx] as usize,
189                            llc_kernel_id: llc.kernel_id,
190                        };
191                        cpu_ids.push(RefCell::new(cpu_id));
192                    }
193                }
194            }
195        }
196
197        // Convert a vector of RefCell to a vector of plain cpu_ids
198        let mut cpu_ids2 = Vec::new();
199        for cpu_id in cpu_ids.iter() {
200            cpu_ids2.push(cpu_id.borrow().clone());
201        }
202        let mut cpu_ids = cpu_ids2;
203
204        // Sort the cpu_ids
205        match (prefer_powersave, self.has_biglittle) {
206            // 1. powersave,      no  big/little
207            //     * within the same LLC domain
208            //         - numa_adx, llc_rdx,
209            //     * prefer more capable CPU with higher capacity
210            //       and larger cache
211            //         - ^cpu_cap (chip binning), ^cache_size,
212            //     * prefere the SMT core within the same performance domain
213            //         - pd_adx, core_rdx, ^smt_level, cpu_rdx
214            (true, false) => {
215                cpu_ids.sort_by(|a, b| {
216                    a.numa_adx
217                        .cmp(&b.numa_adx)
218                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
219                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
220                        .then_with(|| b.cache_size.cmp(&a.cache_size))
221                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
222                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
223                        .then_with(|| b.smt_level.cmp(&a.smt_level))
224                        .then_with(|| a.cpu_rdx.cmp(&b.cpu_rdx))
225                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
226                });
227            }
228            // 2. powersave,      yes big/little
229            //     * within the same LLC domain
230            //         - numa_adx, llc_rdx,
231            //     * prefer energy-efficient LITTLE CPU with a larger cache
232            //         - cpu_cap (big/little), ^cache_size,
233            //     * prefere the SMT core within the same performance domain
234            //         - pd_adx, core_rdx, ^smt_level, cpu_rdx
235            (true, true) => {
236                cpu_ids.sort_by(|a, b| {
237                    a.numa_adx
238                        .cmp(&b.numa_adx)
239                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
240                        .then_with(|| a.cpu_cap.cmp(&b.cpu_cap))
241                        .then_with(|| b.cache_size.cmp(&a.cache_size))
242                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
243                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
244                        .then_with(|| b.smt_level.cmp(&a.smt_level))
245                        .then_with(|| a.cpu_rdx.cmp(&b.cpu_rdx))
246                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
247                });
248            }
249            // 3. performance,    no  big/little
250            // 4. performance,    yes big/little
251            //     * prefer the non-SMT core
252            //         - cpu_rdx,
253            //     * fill the same LLC domain first
254            //         - numa_adx, llc_rdx,
255            //     * prefer more capable CPU with higher capacity
256            //       (chip binning or big/little) and larger cache
257            //         - ^cpu_cap, ^cache_size, smt_level
258            //     * within the same power domain
259            //         - pd_adx, core_rdx
260            _ => {
261                cpu_ids.sort_by(|a, b| {
262                    a.cpu_rdx
263                        .cmp(&b.cpu_rdx)
264                        .then_with(|| a.numa_adx.cmp(&b.numa_adx))
265                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
266                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
267                        .then_with(|| b.cache_size.cmp(&a.cache_size))
268                        .then_with(|| a.smt_level.cmp(&b.smt_level))
269                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
270                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
271                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
272                });
273            }
274        }
275
276        Some(cpu_ids)
277    }
278
279    /// Build a list of compute domains
280    fn build_cpdom(cpu_ids: &Vec<CpuId>) -> Option<BTreeMap<ComputeDomainId, ComputeDomain>> {
281        // Note that building compute domain is independent to CPU orer
282        // so it is okay to use any cpus_*.
283
284        // Creat a compute domain map, where a compute domain is a CPUs that
285        // are under the same node and LLC (virtual and physical) and have the same core type.
286        let mut cpdom_id = 0;
287        let mut cpdom_map: BTreeMap<ComputeDomainId, ComputeDomain> = BTreeMap::new();
288        let mut cpdom_types: BTreeMap<usize, bool> = BTreeMap::new();
289        for cpu_id in cpu_ids.iter() {
290            let key = ComputeDomainId {
291                numa_adx: cpu_id.numa_adx,
292                llc_adx: cpu_id.llc_adx,
293                llc_rdx: cpu_id.llc_rdx,
294                llc_kernel_id: cpu_id.llc_kernel_id,
295                is_big: cpu_id.big_core,
296            };
297            let value = cpdom_map.entry(key.clone()).or_insert_with(|| {
298                let val = ComputeDomain {
299                    cpdom_id,
300                    cpdom_alt_id: Cell::new(cpdom_id),
301                    cpu_ids: Vec::new(),
302                    neighbor_map: RefCell::new(BTreeMap::new()),
303                };
304                cpdom_types.insert(cpdom_id, key.is_big);
305
306                cpdom_id += 1;
307                val
308            });
309            value.cpu_ids.push(cpu_id.cpu_adx);
310        }
311
312        // Build a neighbor map for each compute domain, where neighbors are
313        // ordered by core type, node, and LLC.
314        for ((from_k, from_v), (to_k, to_v)) in iproduct!(cpdom_map.iter(), cpdom_map.iter()) {
315            if from_k == to_k {
316                continue;
317            }
318
319            let d = Self::dist(from_k, to_k);
320            let mut map = from_v.neighbor_map.borrow_mut();
321            match map.get(&d) {
322                Some(v) => {
323                    v.borrow_mut().push(to_v.cpdom_id);
324                }
325                None => {
326                    map.insert(d, RefCell::new(vec![to_v.cpdom_id]));
327                }
328            }
329        }
330
331        // Circular sort compute domains within the same distance to preserve
332        // proximity between domains.
333        //
334        // Suppose that domains 0, 1, 2, 3, 4, 5, 6, 7 are at the same distance.
335        //            0
336        //         7     1
337        //       6         2
338        //         5     3
339        //            4
340        //
341        // We want to traverse the domains from 0. The circular-sorted order
342        // starting from domain 0 is 0, 1, 7, 2, 6, 3, 5, 4. Similarly,
343        // the order starting from domain 1 is 1, 0, 2, 3, 7, 4, 6, 5.
344        // The one from 7 is 7, 0, 6, 1, 5, 2, 4, 3. As follows, circularly
345        // sorted orders in task stealing preserve proximity between domains
346        // (e.g., 0, 1, 7 in the example), so we can achieve less cacheline
347        // bouncing than with random-ordered task stealing.
348        for (_, cpdom) in cpdom_map.iter() {
349            for (_, neighbors) in cpdom.neighbor_map.borrow_mut().iter() {
350                let mut neighbors_csorted =
351                    Self::circular_sort(cpdom.cpdom_id, &neighbors.borrow_mut().to_vec());
352                neighbors.borrow_mut().clear();
353                neighbors.borrow_mut().append(&mut neighbors_csorted);
354            }
355        }
356
357        // Fill up cpdom_alt_id for each compute domain.
358        for (k, v) in cpdom_map.iter() {
359            let mut key = k.clone();
360            key.is_big = !k.is_big;
361
362            if let Some(alt_v) = cpdom_map.get(&key) {
363                // First, try to find an alternative domain
364                // under the same node/LLC.
365                v.cpdom_alt_id.set(alt_v.cpdom_id);
366            } else {
367                // If there is no alternative domain in the same node/LLC,
368                // choose the closest one.
369                //
370                // Note that currently, the idle CPU selection (pick_idle_cpu)
371                // is not optimized for this kind of architecture, where big
372                // and LITTLE cores are in different node/LLCs.
373                'outer: for (_dist, ncpdoms) in v.neighbor_map.borrow().iter() {
374                    for ncpdom_id in ncpdoms.borrow().iter() {
375                        if let Some(is_big) = cpdom_types.get(ncpdom_id) {
376                            if *is_big == key.is_big {
377                                v.cpdom_alt_id.set(*ncpdom_id);
378                                break 'outer;
379                            }
380                        }
381                    }
382                }
383            }
384        }
385
386        Some(cpdom_map)
387    }
388
389    /// Circular sorting of a list from a starting point
390    fn circular_sort(start: usize, the_rest: &Vec<usize>) -> Vec<usize> {
391        // Create a full list including 'start'
392        let mut list = the_rest.clone();
393        list.push(start);
394        list.sort();
395
396        // Get the index of 'start'
397        let s = list
398            .binary_search(&start)
399            .expect("start must appear exactly once");
400
401        // Get the circularly sorted index list.
402        let n = list.len();
403        let dist = |x: usize| {
404            let d = (x + n - s) % n;
405            d.min(n - d)
406        };
407        let mut order: Vec<usize> = (0..n).collect();
408        order.sort_by_key(|&x| (dist(x), x));
409
410        // Rearrange the full list
411        // according to the circularly sorted index list.
412        let list_csorted: Vec<_> = order.iter().map(|&i| list[i]).collect();
413
414        // Drop 'start' from the rearranged full list.
415        list_csorted[1..].to_vec()
416    }
417
418    /// Get the performance domain (i.e., CPU frequency domain) ID for a CPU.
419    /// If the energy model is not available, use LLC ID instead.
420    fn get_pd_id(em: &Result<EnergyModel>, cpu_adx: usize, llc_adx: usize) -> usize {
421        match em {
422            Ok(em) => em.get_pd_by_cpu_id(cpu_adx).unwrap().id,
423            Err(_) => llc_adx,
424        }
425    }
426
427    /// Calculate distance from two compute domains
428    fn dist(from: &ComputeDomainId, to: &ComputeDomainId) -> usize {
429        let mut d = 0;
430        // core type > numa node > llc
431        if from.is_big != to.is_big {
432            d += 100;
433        }
434        if from.numa_adx != to.numa_adx {
435            d += 10;
436        } else {
437            if from.llc_rdx != to.llc_rdx {
438                d += 1;
439            }
440            if from.llc_kernel_id != to.llc_kernel_id {
441                d += 1;
442            }
443        }
444        d
445    }
446}
447
/// Helper that derives the performance-vs-CPU-preference-order table
/// from the energy model and the topological CPU order.
#[derive(Debug)]
struct EnergyModelOptimizer<'a> {
    // Energy model of performance domains
    em: &'a EnergyModel,

    // CPU preference order in a performance mode purely based on topology
    cpus_topological_order: Vec<usize>,

    // CPU preference order within a performance domain
    // (pd id -> CPU adx list, in topological preference order)
    pd_cpu_order: BTreeMap<usize, RefCell<Vec<usize>>>,

    // Total performance capacity of the system
    tot_perf: usize,

    // All possible combinations of performance domains & states
    // indexed by performance.
    pdss_infos: RefCell<BTreeMap<usize, RefCell<HashSet<PDSetInfo<'a>>>>>,

    // Performance domains and states to achieve a certain performance level,
    // which is derived from @pdss_infos.
    perf_pdsi: RefCell<BTreeMap<usize, PDSetInfo<'a>>>,

    // CPU orders indexed by performance
    perf_cpu_order: RefCell<BTreeMap<usize, PerfCpuOrder>>,
}
473
// A (performance domain, performance state) pair.
//
// NOTE(review): Eq/Hash/Ord are derived without PartialEq, so a manual
// PartialEq impl must exist elsewhere in this file (not visible here).
#[derive(Debug, Clone, Eq, Hash, Ord, PartialOrd)]
struct PDS<'a> {
    pd: &'a PerfDomain, // performance domain
    ps: &'a PerfState,  // a performance state of @pd
}
479
// A virtual CPU slot on a performance domain.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialOrd)]
struct PDCpu<'a> {
    pd: &'a PerfDomain, // performance domain
    cpu_vid: usize,     // virtual ID of a CPU on the performance domain
}
485
// A candidate set of (performance domain, virtual CPU) slots along with
// the performance and power it would achieve.
//
// NOTE(review): only Eq is derived while this type is stored in a
// HashSet, so manual PartialEq/Hash impls must exist elsewhere in this
// file (consistent with the `use std::hash::{Hash, Hasher}` import).
#[derive(Debug, Clone, Eq)]
struct PDSetInfo<'a> {
    performance: usize,             // performance delivered by this set
    power: usize,                   // power cost of this set
    pdcpu_set: BTreeSet<PDCpu<'a>>, // the chosen (domain, virtual CPU) slots
    pd_id_set: BTreeSet<usize>,     // pd:id:0, pd:id:1
}
493
// Scaling units for encoding performance-domain combinations.
// NOTE(review): their use is in the combination generator below this
// chunk — confirm against the rest of the file.
const PD_UNIT: usize = 100_000_000;
const CPU_UNIT: usize = 100_000;
// Number of performance levels at/above the target examined by
// find_perf_pds_for() before the search stops.
const LOOKAHEAD_CNT: usize = 10;
497
498impl<'a> EnergyModelOptimizer<'a> {
499    fn new(em: &'a EnergyModel, cpus_pf: &'a Vec<CpuId>) -> EnergyModelOptimizer<'a> {
500        let tot_perf = em.perf_total();
501
502        let pdss_infos: BTreeMap<usize, RefCell<HashSet<PDSetInfo<'a>>>> = BTreeMap::new();
503        let pdss_infos = pdss_infos.into();
504
505        let perf_pdsi: BTreeMap<usize, PDSetInfo<'a>> = BTreeMap::new();
506        let perf_pdsi = perf_pdsi.into();
507
508        let mut pd_cpu_order: BTreeMap<usize, RefCell<Vec<usize>>> = BTreeMap::new();
509        let mut cpus_topological_order: Vec<usize> = vec![];
510        for cpuid in cpus_pf.iter() {
511            match pd_cpu_order.get(&cpuid.pd_adx) {
512                Some(v) => {
513                    let mut v = v.borrow_mut();
514                    v.push(cpuid.cpu_adx);
515                }
516                None => {
517                    let v = vec![cpuid.cpu_adx];
518                    pd_cpu_order.insert(cpuid.pd_adx, v.into());
519                }
520            }
521            cpus_topological_order.push(cpuid.cpu_adx);
522        }
523
524        let perf_cpu_order: BTreeMap<usize, PerfCpuOrder> = BTreeMap::new();
525        let perf_cpu_order = perf_cpu_order.into();
526
527        debug!("# pd_cpu_order");
528        debug!("{:#?}", pd_cpu_order);
529
530        EnergyModelOptimizer {
531            em,
532            cpus_topological_order,
533            pd_cpu_order,
534            tot_perf,
535            pdss_infos,
536            perf_pdsi,
537            perf_cpu_order,
538        }
539    }
540
541    fn get_perf_cpu_order_table(
542        em: &'a EnergyModel,
543        cpus_pf: &'a Vec<CpuId>,
544    ) -> BTreeMap<usize, PerfCpuOrder> {
545        let emo = EnergyModelOptimizer::new(em, &cpus_pf);
546        emo.gen_perf_cpu_order_table();
547        let perf_cpu_order = emo.perf_cpu_order.borrow().clone();
548
549        perf_cpu_order
550    }
551
552    fn get_fake_perf_cpu_order_table(
553        cpus_pf: &'a Vec<CpuId>,
554        cpus_ps: &'a Vec<CpuId>,
555    ) -> BTreeMap<usize, PerfCpuOrder> {
556        let tot_perf: usize = cpus_pf.iter().map(|cpuid| cpuid.cpu_cap).sum();
557
558        let pco_pf = Self::fake_pco(tot_perf, cpus_pf, false);
559        let pco_ps = Self::fake_pco(tot_perf, cpus_ps, true);
560
561        let mut perf_cpu_order: BTreeMap<usize, PerfCpuOrder> = BTreeMap::new();
562        perf_cpu_order.insert(pco_pf.perf_cap, pco_pf);
563        perf_cpu_order.insert(pco_ps.perf_cap, pco_ps);
564
565        perf_cpu_order
566    }
567
568    fn fake_pco(tot_perf: usize, cpuids: &'a Vec<CpuId>, powersave: bool) -> PerfCpuOrder {
569        let perf_cap;
570
571        if powersave {
572            perf_cap = cpuids[0].cpu_cap;
573        } else {
574            perf_cap = tot_perf;
575        }
576
577        let perf_util: f32 = (perf_cap as f32) / (tot_perf as f32);
578        let cpus: Vec<usize> = cpuids.iter().map(|cpuid| cpuid.cpu_adx).collect();
579        let cpus_perf: Vec<usize> = cpus[..1].iter().map(|&cpuid| cpuid).collect();
580        let cpus_ovflw: Vec<usize> = cpus[1..].iter().map(|&cpuid| cpuid).collect();
581        PerfCpuOrder {
582            perf_cap,
583            perf_util,
584            cpus_perf: cpus_perf.clone().into(),
585            cpus_ovflw: cpus_ovflw.clone().into(),
586        }
587    }
588
    /// Generate the performance versus CPU preference order table based on
    /// the system's CPU topology and energy model. The table consists of the
    /// following information (PerfCpuOrder):
    ///
    ///   - PerfCpuOrder::perf_cap: The upper bound of the performance
    ///     capacity covered by this tuple.
    ///
    ///   - PerfCpuOrder::cpus_perf: Primary CPUs to be used, ordered
    ///     by preference.
    ///
    ///   - PerfCpuOrder::cpus_ovflw: When the system load goes beyond
    ///     @perf_cap, the list of CPUs to be used is ordered by preference.
    fn gen_perf_cpu_order_table(&'a self) {
        // First, generate all possible combinations of CPUs (e.g., two CPUs
        // in performance domain 0 and three CPUs in performance domain 1) to
        // achieve the possible performance capacities with minimal energy
        // consumption. We assume a reasonable load balancer, so the
        // utilization of the used CPUs is similar.
        self.gen_all_pds_combinations();

        // Then, from all the possible combinations of performance versus
        // CPU sets, select a list of combinations that minimize the number of
        // active performance domains and reduce the number of performance
        // domain switches when changing performance levels.
        self.gen_perf_pds_table();

        // Finally, assign CPUs (@cpu_adx) to the virtual CPU ID (@cpu_vid) of
        // a performance domain.
        self.assign_cpu_vids();
    }
619
620    /// Generate a CPU order table for each performance range.
621    fn assign_cpu_vids(&'a self) {
622        // Generate CPU order within the performance range (@cpus_perf).
623        for (&perf_cap, pdsi) in self.perf_pdsi.borrow().iter() {
624            let mut cpus_perf: Vec<usize> = vec![];
625
626            for pdcpu in pdsi.pdcpu_set.iter() {
627                let pd_id = pdcpu.pd.id;
628                let cpu_vid = pdcpu.cpu_vid;
629                let cpu_order = self.pd_cpu_order.get(&pd_id).unwrap().borrow();
630                let cpu_adx = cpu_order[cpu_vid];
631                cpus_perf.push(cpu_adx);
632            }
633
634            let perf_util: f32 = (perf_cap as f32) / (self.tot_perf as f32);
635            let cpus_perf = self.sort_cpus_by_topological_order(&cpus_perf);
636            let cpus_ovflw: Vec<usize> = vec![];
637
638            let mut perf_cpu_order = self.perf_cpu_order.borrow_mut();
639            perf_cpu_order.insert(
640                perf_cap,
641                PerfCpuOrder {
642                    perf_cap,
643                    perf_util,
644                    cpus_perf: cpus_perf.clone().into(),
645                    cpus_ovflw: cpus_ovflw.clone().into(),
646                },
647            );
648        }
649
650        // Generate CPU order beyond the performance range (@cpus_ovflw).
651        let perf_cpu_order = self.perf_cpu_order.borrow();
652        let perf_caps: Vec<_> = self.perf_pdsi.borrow().keys().cloned().collect();
653        for o in 1..perf_caps.len() {
654            // Gather all @cpus_perf from the upper performance ranges.
655            let ovrflw_perf_caps = &perf_caps[o..];
656            let mut ovrflw_cpus_all: Vec<usize> = vec![];
657            for perf_cap in ovrflw_perf_caps.iter() {
658                let cpu_order = perf_cpu_order.get(perf_cap).unwrap();
659                let cpus_perf = cpu_order.cpus_perf.borrow();
660                ovrflw_cpus_all.extend(cpus_perf.iter().cloned());
661            }
662
663            // Filter out already taken CPUs from the @ovrflw_cpus_all,
664            // and build @cpus_ovrflw.
665            let mut cpu_set = HashSet::<usize>::new();
666            let perf_cap = perf_caps[o - 1];
667            let cpu_order = perf_cpu_order.get(&perf_cap).unwrap();
668            let cpus_perf = cpu_order.cpus_perf.borrow();
669            for &cpu_adx in cpus_perf.iter() {
670                cpu_set.insert(cpu_adx);
671            }
672
673            let mut cpus_ovflw: Vec<usize> = vec![];
674            for &cpu_adx in ovrflw_cpus_all.iter() {
675                if cpu_set.get(&cpu_adx).is_none() {
676                    cpus_ovflw.push(cpu_adx);
677                    cpu_set.insert(cpu_adx);
678                }
679            }
680
681            // Inject the constructed @cpus_ovrflw to the table.
682            let mut v = cpu_order.cpus_ovflw.borrow_mut();
683            v.extend(cpus_ovflw.iter().cloned());
684        }
685
686        // Debug print of the generated table
687        debug!("## gen_perf_cpu_order_table");
688        debug!("{:#?}", perf_cpu_order);
689    }
690
691    /// Sort the CPU IDs by topological order (@self.cpus_topological_order).
692    fn sort_cpus_by_topological_order(&'a self, cpus: &Vec<usize>) -> Vec<usize> {
693        let mut sorted: Vec<usize> = vec![];
694        for &cpu_adx in self.cpus_topological_order.iter() {
695            if let Some(_) = cpus.iter().find(|&&x| x == cpu_adx) {
696                sorted.push(cpu_adx);
697            }
698        }
699        sorted
700    }
701
702    /// Generate a table of performance vs. performance domain sets
703    /// (@self.perf_pdss) from all the possible performance domain & state
704    /// combinations (@self.pdss_infos).
705    ///
706    /// An example result is as follows:
707    ///     PERF: [_, 300]
708    ///             pd:id: 0 -- cpu_vid: 0
709    ///             pd:id: 0 -- cpu_vid: 1
710    ///     PERF: [_, 1138]
711    ///             pd:id: 0 -- cpu_vid: 0
712    ///             pd:id: 0 -- cpu_vid: 1
713    ///             pd:id: 1 -- cpu_vid: 0
714    ///             pd:id: 1 -- cpu_vid: 1
715    ///     PERF: [_, 3386]
716    ///             pd:id: 1 -- cpu_vid: 0
717    ///             pd:id: 1 -- cpu_vid: 1
718    ///             pd:id: 1 -- cpu_vid: 2
719    ///             pd:id: 2 -- cpu_vid: 0
720    ///             pd:id: 2 -- cpu_vid: 1
721    ///     PERF: [_, 3977]
722    ///             pd:id: 0 -- cpu_vid: 0
723    ///             pd:id: 1 -- cpu_vid: 0
724    ///             pd:id: 1 -- cpu_vid: 1
725    ///             pd:id: 1 -- cpu_vid: 2
726    ///             pd:id: 2 -- cpu_vid: 0
727    ///             pd:id: 2 -- cpu_vid: 1
728    ///     PERF: [_, 4508]
729    ///             pd:id: 0 -- cpu_vid: 0
730    ///             pd:id: 0 -- cpu_vid: 1
731    ///             pd:id: 1 -- cpu_vid: 0
732    ///             pd:id: 1 -- cpu_vid: 1
733    ///             pd:id: 1 -- cpu_vid: 2
734    ///             pd:id: 2 -- cpu_vid: 0
735    ///             pd:id: 2 -- cpu_vid: 1
736    ///     PERF: [_, 5627]
737    ///             pd:id: 0 -- cpu_vid: 0
738    ///             pd:id: 0 -- cpu_vid: 1
739    ///             pd:id: 1 -- cpu_vid: 0
740    ///             pd:id: 1 -- cpu_vid: 1
741    ///             pd:id: 1 -- cpu_vid: 2
742    ///             pd:id: 2 -- cpu_vid: 0
743    ///             pd:id: 2 -- cpu_vid: 1
744    ///             pd:id: 3 -- cpu_vid: 0
    fn gen_perf_pds_table(&'a self) {
        // System utilization targets for which the cheapest domain/state
        // set is searched; results are keyed by the achieved performance.
        let utils = vec![0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0];

        // Find the best performance domains for each system utilization target.
        for &util in utils.iter() {
            let mut best_pdsi: Option<PDSetInfo<'a>>;
            let mut del_pdsi: Option<PDSetInfo<'a>> = None;

            // Search relative to the highest performance level found so
            // far (if any) to minimize performance-domain switches.
            // The borrow taken in the match scrutinee ends with the match,
            // so the borrow_mut() calls below are safe.
            match self.perf_pdsi.borrow().last_key_value() {
                Some((_, base)) => {
                    best_pdsi = self.find_perf_pds_for(util, Some(base));

                    // If the next performance level (@best_pdsi) is subsumed
                    // by the previous level (@base), extend the base to the
                    // next level. To this end, insert the extended base (with
                    // updated performance and power values) and delete the old
                    // base.
                    if let Some(ref best) = best_pdsi {
                        if best.pdcpu_set.is_subset(&base.pdcpu_set) {
                            let ext_pdcpu = PDSetInfo {
                                performance: best.performance,
                                power: best.power,
                                pdcpu_set: base.pdcpu_set.clone(),
                                pd_id_set: base.pd_id_set.clone(),
                            };
                            best_pdsi = Some(ext_pdcpu);
                            del_pdsi = Some(base.clone());
                        }
                    }
                }
                None => {
                    // No level recorded yet: search without a base.
                    best_pdsi = self.find_perf_pds_for(util, None);
                }
            };

            if let Some(best_pdsi) = best_pdsi {
                self.perf_pdsi
                    .borrow_mut()
                    .insert(best_pdsi.performance, best_pdsi);
            }

            if let Some(del_pdsi) = del_pdsi {
                self.perf_pdsi.borrow_mut().remove(&del_pdsi.performance);
            }
        }

        // Debug print of the generated table
        debug!("## gen_perf_pds_table");
        for (perf, pdsi) in self.perf_pdsi.borrow().iter() {
            debug!("PERF: [_, {}]", perf);
            for pdcpu in pdsi.pdcpu_set.iter() {
                debug!(
                    "        pd:id: {:?} -- cpu_vid: {}",
                    pdcpu.pd.id, pdcpu.cpu_vid
                );
            }
        }
    }
803
804    fn find_perf_pds_for(
805        &'a self,
806        util: f32,
807        base: Option<&PDSetInfo<'a>>,
808    ) -> Option<PDSetInfo<'a>> {
809        let target_perf = (util * self.tot_perf as f32) as usize;
810        let mut lookahead = 0;
811        let mut min_dist: usize = usize::MAX;
812        let mut best_pdsi: Option<PDSetInfo<'a>> = None;
813
814        let pdss_infos = self.pdss_infos.borrow();
815        for (&pdsi_perf, pdsi_set) in pdss_infos.iter() {
816            if pdsi_perf >= target_perf {
817                let pdsi_set_ref = pdsi_set.borrow();
818                for pdsi in pdsi_set_ref.iter() {
819                    let dist = pdsi.dist(base);
820                    if dist < min_dist {
821                        min_dist = dist;
822                        best_pdsi = Some(pdsi.clone());
823                    }
824                }
825                lookahead += 1;
826                if lookahead >= LOOKAHEAD_CNT {
827                    break;
828                }
829            }
830        }
831
832        best_pdsi
833    }
834
835    /// Generate all possible performance domain & state combinations,
836    /// @self.pdss_infos. Each combination represents a set of performance
837    /// domains (and their corresponding performance states) that achieve the
838    /// requested performance with minimal power consumption.
839    ///
840    /// We assume a 'reasonable load balancer,' so the CPU utilization of all
841    /// the involved CPUs is similar.
842    ///
843    /// An example result is as follows:
844    ///
845    ///     PERF: [_, 5135]
846    ///         perf: 5135 -- power: 5475348
847    ///             pd:id: 0 -- cpu_vid: 0
848    ///             pd:id: 1 -- cpu_vid: 0
849    ///             pd:id: 1 -- cpu_vid: 1
850    ///             pd:id: 1 -- cpu_vid: 2
851    ///             pd:id: 2 -- cpu_vid: 0
852    ///             pd:id: 2 -- cpu_vid: 1
853    ///             pd:id: 3 -- cpu_vid: 0
854    ///     PERF: [_, 5187]
855    ///         perf: 5187 -- power: 4844969
856    ///             pd:id: 0 -- cpu_vid: 0
857    ///             pd:id: 0 -- cpu_vid: 1
858    ///             pd:id: 1 -- cpu_vid: 0
859    ///             pd:id: 1 -- cpu_vid: 1
860    ///             pd:id: 1 -- cpu_vid: 2
861    ///             pd:id: 2 -- cpu_vid: 0
862    ///             pd:id: 2 -- cpu_vid: 1
863    ///             pd:id: 3 -- cpu_vid: 0
864    ///     PERF: [_, 5195]
865    ///         perf: 5195 -- power: 5924606
866    ///             pd:id: 1 -- cpu_vid: 0
867    ///             pd:id: 1 -- cpu_vid: 1
868    ///             pd:id: 1 -- cpu_vid: 2
869    ///             pd:id: 2 -- cpu_vid: 0
870    ///             pd:id: 2 -- cpu_vid: 1
871    ///             pd:id: 3 -- cpu_vid: 0
872    ///     PERF: [_, 5217]
873    ///         perf: 5217 -- power: 4894911
874    ///             pd:id: 0 -- cpu_vid: 0
875    ///             pd:id: 0 -- cpu_vid: 1
876    ///             pd:id: 1 -- cpu_vid: 0
877    ///             pd:id: 1 -- cpu_vid: 1
878    ///             pd:id: 1 -- cpu_vid: 2
879    ///             pd:id: 2 -- cpu_vid: 0
880    ///             pd:id: 2 -- cpu_vid: 1
881    ///             pd:id: 3 -- cpu_vid: 0
882    ///     PERF: [_, 5225]
883    ///         perf: 5225 -- power: 5665770
884    ///             pd:id: 0 -- cpu_vid: 0
885    ///             pd:id: 1 -- cpu_vid: 0
886    ///             pd:id: 1 -- cpu_vid: 1
887    ///             pd:id: 1 -- cpu_vid: 2
888    ///             pd:id: 2 -- cpu_vid: 0
889    ///             pd:id: 2 -- cpu_vid: 1
890    ///             pd:id: 3 -- cpu_vid: 0
891    ///     PERF: [_, 5316]
892    ///         perf: 5316 -- power: 5860568
893    ///             pd:id: 0 -- cpu_vid: 0
894    ///             pd:id: 1 -- cpu_vid: 0
895    ///             pd:id: 1 -- cpu_vid: 1
896    ///             pd:id: 1 -- cpu_vid: 2
897    ///             pd:id: 2 -- cpu_vid: 0
898    ///             pd:id: 2 -- cpu_vid: 1
899    ///             pd:id: 3 -- cpu_vid: 0
900    fn gen_all_pds_combinations(&'a self) {
901        // Start from the min (0%) and max (100%) CPU utilizations
902        let pdsi_vec = self.gen_pds_combinations(0.0);
903        self.insert_pds_combinations(&pdsi_vec);
904
905        let pdsi_vec = self.gen_pds_combinations(100.0);
906        self.insert_pds_combinations(&pdsi_vec);
907
908        // Then dive into the range between the min and max.
909        self.gen_perf_cpuset_table_range(0, 100);
910
911        // Debug print performance table
912        debug!("## gen_all_pds_combinations");
913        for (perf, pdss_info) in self.pdss_infos.borrow().iter() {
914            debug!("PERF: [_, {}]", perf);
915            for pdsi in pdss_info.borrow().iter() {
916                debug!("    perf: {} -- power: {}", pdsi.performance, pdsi.power);
917                for pdcpu in pdsi.pdcpu_set.iter() {
918                    debug!(
919                        "        pd:id: {:?} -- cpu_vid: {}",
920                        pdcpu.pd.id, pdcpu.cpu_vid
921                    );
922                }
923            }
924        }
925    }
926
927    fn gen_perf_cpuset_table_range(&'a self, low: isize, high: isize) {
928        if low > high {
929            return;
930        }
931
932        // If there is a new performance point in the middle,
933        // let's further explore. Otherwise, stop it here.
934        let mid: isize = low + (high - low) / 2;
935        let pdsi_vec = self.gen_pds_combinations(mid as f32);
936        let found_new = self.insert_pds_combinations(&pdsi_vec);
937        if found_new {
938            self.gen_perf_cpuset_table_range(mid + 1, high);
939            self.gen_perf_cpuset_table_range(low, mid - 1);
940        }
941    }
942
943    fn gen_pds_combinations(&'a self, util: f32) -> Vec<PDSetInfo<'a>> {
944        let mut pdsi_vec = Vec::new();
945
946        let pds_set = self.gen_pds_set(util);
947        let n = pds_set.len();
948        for k in 1..n {
949            let pdss = pds_set.clone();
950            let pds_cmbs: Vec<_> = Combinations::new(pdss, k)
951                .map(|cmb| PDSetInfo::new(cmb.clone()))
952                .collect();
953            pdsi_vec.extend(pds_cmbs);
954        }
955
956        let pdsi = PDSetInfo::new(pds_set.clone());
957        pdsi_vec.push(pdsi);
958
959        pdsi_vec
960    }
961
962    fn insert_pds_combinations(&self, new_pdsi_vec: &Vec<PDSetInfo<'a>>) -> bool {
963        // For the same performance, keep the PDS combinations with the lowest
964        // power consumption. If there are more than one lowest, keep them all
965        // to choose one later when assigning CPUs from the selected
966        // performance domains.
967        let mut found_new = false;
968
969        for new_pdsi in new_pdsi_vec.iter() {
970            let mut pdss_infos = self.pdss_infos.borrow_mut();
971            let v = pdss_infos.get(&new_pdsi.performance);
972            match v {
973                // There are already PDSetInfo in the list.
974                Some(v) => {
975                    let mut v = v.borrow_mut();
976                    let pdsi = &v.iter().next().unwrap();
977                    if pdsi.power == new_pdsi.power {
978                        // If the power consumptions are the same, keep both.
979                        if v.insert(new_pdsi.clone()) {
980                            found_new = true;
981                        }
982                    } else if pdsi.power > new_pdsi.power {
983                        // If the new one takes less power, keep the new one.
984                        v.clear();
985                        v.insert(new_pdsi.clone());
986                        found_new = true;
987                    }
988                }
989                // This is the first for the performance target.
990                None => {
991                    // Let's add it and move on.
992                    let mut v: HashSet<PDSetInfo<'a>> = HashSet::new();
993                    v.insert(new_pdsi.clone());
994                    pdss_infos.insert(new_pdsi.performance, v.into());
995                    found_new = true;
996                }
997            }
998        }
999        found_new
1000    }
1001
1002    /// Get a vector of (performance domain, performance state) to achieve
1003    /// the given CPU utilization, @util.
1004    fn gen_pds_set(&self, util: f32) -> Vec<PDS<'_>> {
1005        let mut pds_set = vec![];
1006        for (_, pd) in self.em.perf_doms.iter() {
1007            let ps = pd.select_perf_state(util).unwrap();
1008            let pds = PDS::new(pd, ps);
1009            pds_set.push(pds);
1010        }
1011        self.expand_pds_set(&mut pds_set);
1012        pds_set
1013    }
1014
1015    /// Expand a PDS vector such that a performance domain with X CPUs
1016    /// has N elements in the vector. This is purely for generating
1017    /// combinations easy.
1018    fn expand_pds_set(&self, pds_set: &mut Vec<PDS<'_>>) {
1019        let mut xset = vec![];
1020        // For a performance domain having nr_cpus, add nr_cpus-1 more
1021        // PDS to make the PDS nr_cpus in the vector.
1022        for pds in pds_set.iter() {
1023            let nr_cpus = pds.pd.span.weight();
1024            for _ in 1..nr_cpus {
1025                xset.push(pds.clone());
1026            }
1027        }
1028        pds_set.append(&mut xset);
1029
1030        // Sort the pds_set for easy comparision.
1031        pds_set.sort();
1032    }
1033}
1034
1035impl<'a> PDS<'_> {
1036    fn new(pd: &'a PerfDomain, ps: &'a PerfState) -> PDS<'a> {
1037        PDS { pd, ps }
1038    }
1039}
1040
// Two PDS are equal iff they refer to the same performance domain and the
// same performance state. (PDS is sorted and used as a BTreeMap key
// elsewhere in this file, so Eq/Ord impls exist too.)
impl PartialEq for PDS<'_> {
    fn eq(&self, other: &Self) -> bool {
        self.pd == other.pd && self.ps == other.ps
    }
}
1046
1047impl<'a> PDCpu<'_> {
1048    fn new(pd: &'a PerfDomain, cpu_vid: usize) -> PDCpu<'a> {
1049        PDCpu { pd, cpu_vid }
1050    }
1051}
1052
// Two PDCpu are equal iff they refer to the same performance domain and
// carry the same virtual CPU id.
impl PartialEq for PDCpu<'_> {
    fn eq(&self, other: &Self) -> bool {
        self.pd == other.pd && self.cpu_vid == other.cpu_vid
    }
}
1058
1059impl fmt::Display for PDS<'_> {
1060    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1061        write!(
1062            f,
1063            "pd:id:{}/pd:weight:{}/ps:cap:{}/ps:power:{}",
1064            self.pd.id,
1065            self.pd.span.weight(),
1066            self.ps.performance,
1067            self.ps.power,
1068        )?;
1069        Ok(())
1070    }
1071}
1072
1073impl<'a> PDSetInfo<'_> {
1074    fn new(pds_set: Vec<PDS<'a>>) -> PDSetInfo<'a> {
1075        // Create a pd_id_set and calculate performance and power.
1076        let mut performance = 0;
1077        let mut power = 0;
1078        let mut pd_id_set: BTreeSet<usize> = BTreeSet::new();
1079
1080        for pds in pds_set.iter() {
1081            performance += pds.ps.performance;
1082            power += pds.ps.power;
1083            pd_id_set.insert(pds.pd.id);
1084        }
1085
1086        // Create a pdcpu_set, so first gather the same PDS entires.
1087        let mut pds_map: BTreeMap<PDS<'a>, RefCell<Vec<PDS<'a>>>> = BTreeMap::new();
1088
1089        for pds in pds_set.iter() {
1090            let v = pds_map.get(&pds);
1091            match v {
1092                Some(v) => {
1093                    let mut v = v.borrow_mut();
1094                    v.push(pds.clone());
1095                }
1096                None => {
1097                    let mut v: Vec<PDS<'a>> = Vec::new();
1098                    v.push(pds.clone());
1099                    pds_map.insert(pds.clone(), v.into());
1100                }
1101            }
1102        }
1103        // Then assign cpu virtual ids to pdcpu_set.
1104        let mut pdcpu_set: BTreeSet<PDCpu<'a>> = BTreeSet::new();
1105        let pds_map = pds_map;
1106
1107        for (_, v) in pds_map.iter() {
1108            for (cpu_vid, pds) in v.borrow().iter().enumerate() {
1109                let pdcpu = PDCpu::new(pds.pd, cpu_vid);
1110                pdcpu_set.insert(pdcpu);
1111            }
1112        }
1113
1114        PDSetInfo {
1115            performance,
1116            power,
1117            pdcpu_set,
1118            pd_id_set,
1119        }
1120    }
1121
1122    /// Calculate the distance from @base to @self. We minimize the number of
1123    /// performance domains involved to reduce the leakage power consumption.
1124    /// We then maximize the overlap between the previous (i.e., base)
1125    /// performance domains and the new one for a smooth transition to the new
1126    /// cpuset with higher cache locality. Finally, we minimize the number of
1127    /// CPUs involved, thereby reducing the chance of contention for shared
1128    /// hardware resources (e.g., shared cache).
1129    fn dist(&self, base: Option<&PDSetInfo<'a>>) -> usize {
1130        let nr_pds = self.pd_id_set.len();
1131        let nr_pds_overlap = match base {
1132            Some(base) => self.pd_id_set.intersection(&base.pd_id_set).count(),
1133            None => 0,
1134        };
1135        let nr_cpus = self.pdcpu_set.len();
1136
1137        ((nr_pds - nr_pds_overlap) * PD_UNIT) +         // # non-overlapping PDs
1138        ((*NR_CPU_IDS - nr_cpus) * CPU_UNIT) +          // # of CPUs
1139        (*NR_CPU_IDS - self.pd_id_set.first().unwrap()) // PD ID as a tiebreaker
1140    }
1141}
1142
// Equality compares the aggregate figures and the per-CPU set; pd_id_set
// is omitted because it is derived from the same underlying PDS set as
// pdcpu_set. Since eq() implies equal pdcpu_set, this stays consistent
// with the Hash impl below, which hashes pdcpu_set only.
impl PartialEq for PDSetInfo<'_> {
    fn eq(&self, other: &Self) -> bool {
        self.performance == other.performance
            && self.power == other.power
            && self.pdcpu_set == other.pdcpu_set
    }
}
1150
impl Hash for PDSetInfo<'_> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // We don't need to hash performance, power, and pd_id_set since
        // they are derived from the same underlying PDS set as pdcpu_set.
        // This is consistent with PartialEq: equal values have equal
        // pdcpu_set, hence equal hashes.
        self.pdcpu_set.hash(state);
    }
}
1158
// PerfCpuOrder identity is its capacity bound alone; the CPU orderings
// are derived from it and intentionally ignored here.
impl PartialEq for PerfCpuOrder {
    fn eq(&self, other: &Self) -> bool {
        self.perf_cap == other.perf_cap
    }
}
1164
1165impl fmt::Display for PerfCpuOrder {
1166    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1167        write!(
1168            f,
1169            "capacity bound:  {} ({}%)\n",
1170            self.perf_cap,
1171            self.perf_util * 100.0
1172        )?;
1173        write!(f, "  primary CPUs:  {:?}\n", self.cpus_perf.borrow())?;
1174        write!(f, "  overflow CPUs: {:?}", self.cpus_ovflw.borrow())?;
1175        Ok(())
1176    }
1177}