scx_lavd/
cpu_order.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2025 Valve Corporation.
4// Author: Changwoo Min <changwoo@igalia.com>
5
6// This software may be used and distributed according to the terms of the
7// GNU General Public License version 2.
8
9use anyhow::Result;
10use combinations::Combinations;
11use itertools::iproduct;
12use log::debug;
13use scx_utils::CoreType;
14use scx_utils::Cpumask;
15use scx_utils::EnergyModel;
16use scx_utils::PerfDomain;
17use scx_utils::PerfState;
18use scx_utils::Topology;
19use scx_utils::NR_CPU_IDS;
20use std::cell::Cell;
21use std::cell::RefCell;
22use std::collections::BTreeMap;
23use std::collections::BTreeSet;
24use std::collections::HashSet;
25use std::fmt;
26use std::hash::{Hash, Hasher};
27
/// Fully-qualified position of a single CPU in the system topology, plus
/// its per-CPU properties (capacity, cache size, core type, SMT sibling).
#[derive(Debug, Clone)]
pub struct CpuId {
    // Naming convention:
    // - *_adx: an absolute index within a system scope
    // - *_rdx: a relative index under a parent
    //
    // - numa_adx: a NUMA domain within a system
    // - pd_adx: a performance domain (CPU frequency domain) within a system
    //   - llc_rdx: an LLC domain (CCX) under a NUMA domain
    //   - llc_kernel_id: physical LLC domain ID provided by the kernel
    //     - core_rdx: a core under a LLC domain
    //       - cpu_rdx: a CPU under a core
    pub numa_adx: usize,
    pub pd_adx: usize,
    pub llc_adx: usize,
    pub llc_rdx: usize,
    pub llc_kernel_id: usize,
    pub core_rdx: usize,
    pub cpu_rdx: usize,
    pub cpu_adx: usize,
    pub smt_level: usize,   // number of hardware threads on this CPU's core
    pub cache_size: usize,  // cache size reported for this CPU
    pub cpu_cap: usize,     // CPU capacity (reflects big/little and chip binning)
    pub big_core: bool,     // true unless core type is CoreType::Little
    pub turbo_core: bool,   // true when core type is CoreType::Big { turbo: true }
    pub cpu_sibling: usize, // SMT sibling CPU id (from Topology::sibling_cpus)
}
54
/// Key identifying a compute domain: CPUs that are under the same NUMA
/// node and LLC (virtual and physical) and share the same core type.
///
/// NOTE: the derived `Ord`/`PartialOrd` compare fields in declaration
/// order, so the field order here is semantically significant.
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone)]
pub struct ComputeDomainId {
    pub numa_adx: usize,
    pub llc_adx: usize,
    pub llc_rdx: usize,
    pub llc_kernel_id: usize,
    pub is_big: bool,
}
63
/// A compute domain: the set of CPUs that share one ComputeDomainId.
#[derive(Debug, Clone)]
pub struct ComputeDomain {
    pub cpdom_id: usize,           // unique id, assigned in discovery order
    pub cpdom_alt_id: Cell<usize>, // id of the alternative domain with the opposite core type (or the closest such domain)
    pub cpu_ids: Vec<usize>,       // absolute CPU ids belonging to this domain
    pub neighbor_map: RefCell<BTreeMap<usize, RefCell<Vec<usize>>>>, // distance -> neighboring compute domain ids (see dist())
}
71
/// CPU preference order for one performance capacity range.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct PerfCpuOrder {
    pub perf_cap: usize,                 // performance in capacity
    pub perf_util: f32,                  // performance in utilization, [0, 1]
    pub cpus_perf: RefCell<Vec<usize>>,  // CPU adx order within the performance range by @perf_cap
    pub cpus_ovflw: RefCell<Vec<usize>>, // CPU adx order beyond @perf_cap
}
80
/// Top-level result: CPU preference orders, compute domains, and summary
/// counts describing the system topology.
#[derive(Debug)]
#[allow(dead_code)]
pub struct CpuOrder {
    pub all_cpus_mask: Cpumask, // mask spanning all CPUs in the topology
    pub cpuids: Vec<CpuId>,     // CPUs in performance-mode preference order
    pub perf_cpu_order: BTreeMap<usize, PerfCpuOrder>, // CPU orders indexed by performance capacity
    pub cpdom_map: BTreeMap<ComputeDomainId, ComputeDomain>,
    pub nr_cpus: usize,
    pub nr_cores: usize,
    pub nr_cpdoms: usize,
    pub nr_llcs: usize,
    pub nr_numa: usize,
    pub smt_enabled: bool,
    pub has_biglittle: bool,    // heterogeneous (big/LITTLE) cores present
    pub has_energy_model: bool, // kernel energy model available
}
97
98impl CpuOrder {
99    /// Build a cpu preference order with optional topology configuration
100    pub fn new(topology_args: Option<&scx_utils::TopologyArgs>) -> Result<CpuOrder> {
101        let ctx = CpuOrderCtx::new(topology_args)?;
102        let cpus_pf = ctx.build_topo_order(false).unwrap();
103        let cpus_ps = ctx.build_topo_order(true).unwrap();
104        let cpdom_map = CpuOrderCtx::build_cpdom(&cpus_pf).unwrap();
105        let perf_cpu_order = if ctx.em.is_ok() {
106            let em = ctx.em.unwrap();
107            EnergyModelOptimizer::get_perf_cpu_order_table(&em, &cpus_pf)
108        } else {
109            EnergyModelOptimizer::get_fake_perf_cpu_order_table(&cpus_pf, &cpus_ps)
110        };
111
112        let nr_cpdoms = cpdom_map.len();
113        Ok(CpuOrder {
114            all_cpus_mask: ctx.topo.span,
115            cpuids: cpus_pf,
116            perf_cpu_order,
117            cpdom_map,
118            nr_cpus: ctx.topo.all_cpus.len(),
119            nr_cores: ctx.topo.all_cores.len(),
120            nr_cpdoms,
121            nr_llcs: ctx.topo.all_llcs.len(),
122            nr_numa: ctx.topo.nodes.len(),
123            smt_enabled: ctx.smt_enabled,
124            has_biglittle: ctx.has_biglittle,
125            has_energy_model: ctx.has_energy_model,
126        })
127    }
128}
129
/// CpuOrderCtx is a helper struct used to build a CpuOrder.
struct CpuOrderCtx {
    topo: Topology,            // probed system topology
    em: Result<EnergyModel>,   // kernel energy model; Err(_) when unavailable
    smt_enabled: bool,
    has_biglittle: bool,
    has_energy_model: bool,    // cached em.is_ok()
}
138
139impl CpuOrderCtx {
140    fn new(topology_args: Option<&scx_utils::TopologyArgs>) -> Result<Self> {
141        let topo = match topology_args {
142            Some(args) => Topology::with_args(args)?,
143            None => Topology::new()?,
144        };
145
146        let em = EnergyModel::new();
147        let smt_enabled = topo.smt_enabled;
148        let has_biglittle = topo.has_little_cores();
149        let has_energy_model = em.is_ok();
150
151        debug!("{:#?}", topo);
152        debug!("{:#?}", em);
153
154        Ok(CpuOrderCtx {
155            topo,
156            em,
157            smt_enabled,
158            has_biglittle,
159            has_energy_model,
160        })
161    }
162
163    /// Build a CPU preference order based on its optimization target
164    fn build_topo_order(&self, prefer_powersave: bool) -> Option<Vec<CpuId>> {
165        let mut cpu_ids = Vec::new();
166        let smt_siblings = self.topo.sibling_cpus();
167
168        // Build a vector of cpu ids.
169        for (&numa_adx, node) in self.topo.nodes.iter() {
170            for (llc_rdx, (&llc_adx, llc)) in node.llcs.iter().enumerate() {
171                for (core_rdx, (_core_adx, core)) in llc.cores.iter().enumerate() {
172                    for (cpu_rdx, (cpu_adx, cpu)) in core.cpus.iter().enumerate() {
173                        let cpu_adx = *cpu_adx;
174                        let pd_adx = Self::get_pd_id(&self.em, cpu_adx, llc_adx);
175                        let cpu_id = CpuId {
176                            numa_adx,
177                            pd_adx,
178                            llc_adx,
179                            llc_rdx,
180                            core_rdx,
181                            cpu_rdx,
182                            cpu_adx,
183                            smt_level: cpu.smt_level,
184                            cache_size: cpu.cache_size,
185                            cpu_cap: cpu.cpu_capacity,
186                            big_core: cpu.core_type != CoreType::Little,
187                            turbo_core: cpu.core_type == CoreType::Big { turbo: true },
188                            cpu_sibling: smt_siblings[cpu_adx] as usize,
189                            llc_kernel_id: llc.kernel_id,
190                        };
191                        cpu_ids.push(RefCell::new(cpu_id));
192                    }
193                }
194            }
195        }
196
197        // Convert a vector of RefCell to a vector of plain cpu_ids
198        let mut cpu_ids2 = Vec::new();
199        for cpu_id in cpu_ids.iter() {
200            cpu_ids2.push(cpu_id.borrow().clone());
201        }
202        let mut cpu_ids = cpu_ids2;
203
204        // Sort the cpu_ids
205        match (prefer_powersave, self.has_biglittle) {
206            // 1. powersave,      no  big/little
207            //     * within the same LLC domain
208            //         - numa_adx, llc_rdx,
209            //     * prefer more capable CPU with higher capacity
210            //       and larger cache
211            //         - ^cpu_cap (chip binning), ^cache_size,
212            //     * prefere the SMT core within the same performance domain
213            //         - pd_adx, core_rdx, ^smt_level, cpu_rdx
214            (true, false) => {
215                cpu_ids.sort_by(|a, b| {
216                    a.numa_adx
217                        .cmp(&b.numa_adx)
218                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
219                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
220                        .then_with(|| b.cache_size.cmp(&a.cache_size))
221                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
222                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
223                        .then_with(|| b.smt_level.cmp(&a.smt_level))
224                        .then_with(|| a.cpu_rdx.cmp(&b.cpu_rdx))
225                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
226                });
227            }
228            // 2. powersave,      yes big/little
229            //     * within the same LLC domain
230            //         - numa_adx, llc_rdx,
231            //     * prefer energy-efficient LITTLE CPU with a larger cache
232            //         - cpu_cap (big/little), ^cache_size,
233            //     * prefere the SMT core within the same performance domain
234            //         - pd_adx, core_rdx, ^smt_level, cpu_rdx
235            (true, true) => {
236                cpu_ids.sort_by(|a, b| {
237                    a.numa_adx
238                        .cmp(&b.numa_adx)
239                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
240                        .then_with(|| a.cpu_cap.cmp(&b.cpu_cap))
241                        .then_with(|| b.cache_size.cmp(&a.cache_size))
242                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
243                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
244                        .then_with(|| b.smt_level.cmp(&a.smt_level))
245                        .then_with(|| a.cpu_rdx.cmp(&b.cpu_rdx))
246                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
247                });
248            }
249            // 3. performance,    no  big/little
250            // 4. performance,    yes big/little
251            //     * prefer the non-SMT core
252            //         - cpu_rdx,
253            //     * fill the same LLC domain first
254            //         - numa_adx, llc_rdx,
255            //     * prefer more capable CPU with higher capacity
256            //       (chip binning or big/little) and larger cache
257            //         - ^cpu_cap, ^cache_size, smt_level
258            //     * within the same power domain
259            //         - pd_adx, core_rdx
260            _ => {
261                cpu_ids.sort_by(|a, b| {
262                    a.cpu_rdx
263                        .cmp(&b.cpu_rdx)
264                        .then_with(|| a.numa_adx.cmp(&b.numa_adx))
265                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
266                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
267                        .then_with(|| b.cache_size.cmp(&a.cache_size))
268                        .then_with(|| a.smt_level.cmp(&b.smt_level))
269                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
270                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
271                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
272                });
273            }
274        }
275
276        Some(cpu_ids)
277    }
278
    /// Build a list of compute domains.
    ///
    /// A compute domain groups the CPUs that are under the same NUMA node
    /// and LLC (virtual and physical) and have the same core type.
    fn build_cpdom(cpu_ids: &Vec<CpuId>) -> Option<BTreeMap<ComputeDomainId, ComputeDomain>> {
        // Note that building compute domains is independent of the CPU
        // order, so it is okay to use any cpus_*.

        // Create the compute domain map. Domain ids are assigned in the
        // order in which the domains are first encountered in @cpu_ids.
        let mut cpdom_id = 0;
        let mut cpdom_map: BTreeMap<ComputeDomainId, ComputeDomain> = BTreeMap::new();
        let mut cpdom_types: BTreeMap<usize, bool> = BTreeMap::new();
        for cpu_id in cpu_ids.iter() {
            let key = ComputeDomainId {
                numa_adx: cpu_id.numa_adx,
                llc_adx: cpu_id.llc_adx,
                llc_rdx: cpu_id.llc_rdx,
                llc_kernel_id: cpu_id.llc_kernel_id,
                is_big: cpu_id.big_core,
            };
            let value = cpdom_map.entry(key.clone()).or_insert_with(|| {
                let val = ComputeDomain {
                    cpdom_id,
                    cpdom_alt_id: Cell::new(cpdom_id),
                    cpu_ids: Vec::new(),
                    neighbor_map: RefCell::new(BTreeMap::new()),
                };
                // Remember each domain id's core type for the
                // alternative-domain search below.
                cpdom_types.insert(cpdom_id, key.is_big);

                cpdom_id += 1;
                val
            });
            value.cpu_ids.push(cpu_id.cpu_adx);
        }

        // Build a neighbor map for each compute domain, where neighbors are
        // bucketed by distance (see dist()): core type > node > LLC.
        for ((from_k, from_v), (to_k, to_v)) in iproduct!(cpdom_map.iter(), cpdom_map.iter()) {
            if from_k == to_k {
                continue;
            }

            let d = Self::dist(from_k, to_k);
            let mut map = from_v.neighbor_map.borrow_mut();
            match map.get(&d) {
                Some(v) => {
                    v.borrow_mut().push(to_v.cpdom_id);
                }
                None => {
                    map.insert(d, RefCell::new(vec![to_v.cpdom_id]));
                }
            }
        }

        // Fill up cpdom_alt_id for each compute domain.
        for (k, v) in cpdom_map.iter() {
            let mut key = k.clone();
            key.is_big = !k.is_big;

            if let Some(alt_v) = cpdom_map.get(&key) {
                // First, try to find an alternative domain
                // under the same node/LLC.
                v.cpdom_alt_id.set(alt_v.cpdom_id);
            } else {
                // If there is no alternative domain in the same node/LLC,
                // choose the closest one (neighbor_map iterates in
                // increasing distance order).
                //
                // Note that currently, the idle CPU selection (pick_idle_cpu)
                // is not optimized for this kind of architecture, where big
                // and LITTLE cores are in different node/LLCs.
                'outer: for (_dist, ncpdoms) in v.neighbor_map.borrow().iter() {
                    for ncpdom_id in ncpdoms.borrow().iter() {
                        if let Some(is_big) = cpdom_types.get(ncpdom_id) {
                            if *is_big == key.is_big {
                                v.cpdom_alt_id.set(*ncpdom_id);
                                break 'outer;
                            }
                        }
                    }
                }
            }
        }

        Some(cpdom_map)
    }
362
363    /// Get the performance domain (i.e., CPU frequency domain) ID for a CPU.
364    /// If the energy model is not available, use LLC ID instead.
365    fn get_pd_id(em: &Result<EnergyModel>, cpu_adx: usize, llc_adx: usize) -> usize {
366        match em {
367            Ok(em) => em.get_pd_by_cpu_id(cpu_adx).unwrap().id,
368            Err(_) => llc_adx,
369        }
370    }
371
372    /// Calculate distance from two compute domains
373    fn dist(from: &ComputeDomainId, to: &ComputeDomainId) -> usize {
374        let mut d = 0;
375        // core type > numa node > llc
376        if from.is_big != to.is_big {
377            d += 3;
378        }
379        if from.numa_adx != to.numa_adx {
380            d += 2;
381        } else {
382            if from.llc_rdx != to.llc_rdx {
383                d += 1;
384            }
385            if from.llc_kernel_id != to.llc_kernel_id {
386                d += 1;
387            }
388        }
389        d
390    }
391}
392
/// Helper that derives, from the kernel energy model, which CPUs should be
/// used (and in which order) for each achievable performance level.
#[derive(Debug)]
struct EnergyModelOptimizer<'a> {
    // Energy model of performance domains
    em: &'a EnergyModel,

    // CPU preference order in a performance mode purely based on topology
    cpus_topological_order: Vec<usize>,

    // CPU preference order within a performance domain
    pd_cpu_order: BTreeMap<usize, RefCell<Vec<usize>>>,

    // Total performance capacity of the system
    tot_perf: usize,

    // All possible combinations of performance domains & states,
    // indexed by performance.
    pdss_infos: RefCell<BTreeMap<usize, RefCell<HashSet<PDSetInfo<'a>>>>>,

    // Performance domains and states to achieve a certain performance level,
    // which is derived from @pdss_infos.
    perf_pdsi: RefCell<BTreeMap<usize, PDSetInfo<'a>>>,

    // CPU orders indexed by performance
    perf_cpu_order: RefCell<BTreeMap<usize, PerfCpuOrder>>,
}
418
/// A performance domain paired with one of its performance states.
///
/// NOTE(review): `Eq`/`Hash`/`Ord` are derived without `PartialEq`, so a
/// manual `PartialEq` impl presumably exists later in this file — verify.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialOrd)]
struct PDS<'a> {
    pd: &'a PerfDomain, // performance domain
    ps: &'a PerfState,  // one performance state of @pd
}
424
/// A (performance domain, virtual CPU slot) pair; @cpu_vid indexes into the
/// per-domain CPU preference order.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialOrd)]
struct PDCpu<'a> {
    pd: &'a PerfDomain, // performance domain
    cpu_vid: usize,     // virtual ID of a CPU on the performance domain
}
430
/// A set of (performance domain, CPU slot) pairs together with the total
/// performance it delivers and the power it consumes.
///
/// NOTE(review): derives `Eq` without `PartialEq`/`Hash`; manual impls
/// presumably exist later in this file — verify.
#[derive(Debug, Clone, Eq)]
struct PDSetInfo<'a> {
    performance: usize,
    power: usize,
    pdcpu_set: BTreeSet<PDCpu<'a>>,
    pd_id_set: BTreeSet<usize>, // pd:id:0, pd:id:1
}
438
// Scaling units used by the optimizer when encoding performance-domain and
// per-CPU quantities (used later in this file).
const PD_UNIT: usize = 100_000_000;
const CPU_UNIT: usize = 100_000;
// Number of qualifying performance levels examined in find_perf_pds_for().
const LOOKAHEAD_CNT: usize = 10;
442
443impl<'a> EnergyModelOptimizer<'a> {
444    fn new(em: &'a EnergyModel, cpus_pf: &'a Vec<CpuId>) -> EnergyModelOptimizer<'a> {
445        let tot_perf = em.perf_total();
446
447        let pdss_infos: BTreeMap<usize, RefCell<HashSet<PDSetInfo<'a>>>> = BTreeMap::new();
448        let pdss_infos = pdss_infos.into();
449
450        let perf_pdsi: BTreeMap<usize, PDSetInfo<'a>> = BTreeMap::new();
451        let perf_pdsi = perf_pdsi.into();
452
453        let mut pd_cpu_order: BTreeMap<usize, RefCell<Vec<usize>>> = BTreeMap::new();
454        let mut cpus_topological_order: Vec<usize> = vec![];
455        for cpuid in cpus_pf.iter() {
456            match pd_cpu_order.get(&cpuid.pd_adx) {
457                Some(v) => {
458                    let mut v = v.borrow_mut();
459                    v.push(cpuid.cpu_adx);
460                }
461                None => {
462                    let v = vec![cpuid.cpu_adx];
463                    pd_cpu_order.insert(cpuid.pd_adx, v.into());
464                }
465            }
466            cpus_topological_order.push(cpuid.cpu_adx);
467        }
468
469        let perf_cpu_order: BTreeMap<usize, PerfCpuOrder> = BTreeMap::new();
470        let perf_cpu_order = perf_cpu_order.into();
471
472        debug!("# pd_cpu_order");
473        debug!("{:#?}", pd_cpu_order);
474
475        EnergyModelOptimizer {
476            em,
477            cpus_topological_order,
478            pd_cpu_order,
479            tot_perf,
480            pdss_infos,
481            perf_pdsi,
482            perf_cpu_order,
483        }
484    }
485
486    fn get_perf_cpu_order_table(
487        em: &'a EnergyModel,
488        cpus_pf: &'a Vec<CpuId>,
489    ) -> BTreeMap<usize, PerfCpuOrder> {
490        let emo = EnergyModelOptimizer::new(em, &cpus_pf);
491        emo.gen_perf_cpu_order_table();
492        let perf_cpu_order = emo.perf_cpu_order.borrow().clone();
493
494        perf_cpu_order
495    }
496
497    fn get_fake_perf_cpu_order_table(
498        cpus_pf: &'a Vec<CpuId>,
499        cpus_ps: &'a Vec<CpuId>,
500    ) -> BTreeMap<usize, PerfCpuOrder> {
501        let tot_perf: usize = cpus_pf.iter().map(|cpuid| cpuid.cpu_cap).sum();
502
503        let pco_pf = Self::fake_pco(tot_perf, cpus_pf, false);
504        let pco_ps = Self::fake_pco(tot_perf, cpus_ps, true);
505
506        let mut perf_cpu_order: BTreeMap<usize, PerfCpuOrder> = BTreeMap::new();
507        perf_cpu_order.insert(pco_pf.perf_cap, pco_pf);
508        perf_cpu_order.insert(pco_ps.perf_cap, pco_ps);
509
510        perf_cpu_order
511    }
512
513    fn fake_pco(tot_perf: usize, cpuids: &'a Vec<CpuId>, powersave: bool) -> PerfCpuOrder {
514        let perf_cap;
515
516        if powersave {
517            perf_cap = cpuids[0].cpu_cap;
518        } else {
519            perf_cap = tot_perf;
520        }
521
522        let perf_util: f32 = (perf_cap as f32) / (tot_perf as f32);
523        let cpus: Vec<usize> = cpuids.iter().map(|cpuid| cpuid.cpu_adx).collect();
524        let cpus_perf: Vec<usize> = cpus[..1].iter().map(|&cpuid| cpuid).collect();
525        let cpus_ovflw: Vec<usize> = cpus[1..].iter().map(|&cpuid| cpuid).collect();
526        PerfCpuOrder {
527            perf_cap,
528            perf_util,
529            cpus_perf: cpus_perf.clone().into(),
530            cpus_ovflw: cpus_ovflw.clone().into(),
531        }
532    }
533
    /// Generate the performance versus CPU preference order table based on
    /// the system's CPU topology and energy model. The table consists of the
    /// following information (PerfCpuOrder):
    ///
    ///   - PerfCpuOrder::perf_cap: The upper bound of the performance
    ///     capacity covered by this tuple.
    ///
    ///   - PerfCpuOrder::cpus_perf: Primary CPUs to be used, ordered
    ///     by preference.
    ///
    ///   - PerfCpuOrder::cpus_ovflw: When the system load goes beyond
    ///     @perf_cap, the list of CPUs to be used, ordered by preference.
    fn gen_perf_cpu_order_table(&'a self) {
        // First, generate all possible combinations of CPUs (e.g., two CPUs
        // in performance domain 0 and three CPUs in performance domain 1) to
        // achieve the possible performance capacities with minimal energy
        // consumption. We assume a reasonable load balancer, so the
        // utilization of the used CPUs is similar.
        self.gen_all_pds_combinations();

        // Then, from all the possible combinations of performance versus
        // CPU sets, select a list of combinations that minimize the number of
        // active performance domains and reduce the number of performance
        // domain switches when changing performance levels.
        self.gen_perf_pds_table();

        // Finally, assign CPUs (@cpu_adx) to the virtual CPU ID (@cpu_vid) of
        // a performance domain.
        self.assign_cpu_vids();
    }
564
    /// Generate a CPU order table for each performance range.
    ///
    /// For every entry in @perf_pdsi, resolve each (performance domain,
    /// cpu_vid) slot into a concrete CPU id to form @cpus_perf, then derive
    /// the overflow list (@cpus_ovflw) from the higher performance ranges.
    fn assign_cpu_vids(&'a self) {
        // Generate CPU order within the performance range (@cpus_perf).
        for (&perf_cap, pdsi) in self.perf_pdsi.borrow().iter() {
            let mut cpus_perf: Vec<usize> = vec![];

            // A slot's cpu_vid indexes into its performance domain's own
            // CPU preference order.
            for pdcpu in pdsi.pdcpu_set.iter() {
                let pd_id = pdcpu.pd.id;
                let cpu_vid = pdcpu.cpu_vid;
                let cpu_order = self.pd_cpu_order.get(&pd_id).unwrap().borrow();
                let cpu_adx = cpu_order[cpu_vid];
                cpus_perf.push(cpu_adx);
            }

            let perf_util: f32 = (perf_cap as f32) / (self.tot_perf as f32);
            let cpus_perf = self.sort_cpus_by_topological_order(&cpus_perf);
            let cpus_ovflw: Vec<usize> = vec![];

            let mut perf_cpu_order = self.perf_cpu_order.borrow_mut();
            perf_cpu_order.insert(
                perf_cap,
                PerfCpuOrder {
                    perf_cap,
                    perf_util,
                    cpus_perf: cpus_perf.clone().into(),
                    cpus_ovflw: cpus_ovflw.clone().into(),
                },
            );
        }

        // Generate CPU order beyond the performance range (@cpus_ovflw).
        let perf_cpu_order = self.perf_cpu_order.borrow();
        let perf_caps: Vec<_> = self.perf_pdsi.borrow().keys().cloned().collect();
        for o in 1..perf_caps.len() {
            // Gather all @cpus_perf from the upper performance ranges.
            let ovrflw_perf_caps = &perf_caps[o..];
            let mut ovrflw_cpus_all: Vec<usize> = vec![];
            for perf_cap in ovrflw_perf_caps.iter() {
                let cpu_order = perf_cpu_order.get(perf_cap).unwrap();
                let cpus_perf = cpu_order.cpus_perf.borrow();
                ovrflw_cpus_all.extend(cpus_perf.iter().cloned());
            }

            // Filter out already taken CPUs from the @ovrflw_cpus_all,
            // and build @cpus_ovflw (deduplicating while preserving order).
            let mut cpu_set = HashSet::<usize>::new();
            let perf_cap = perf_caps[o - 1];
            let cpu_order = perf_cpu_order.get(&perf_cap).unwrap();
            let cpus_perf = cpu_order.cpus_perf.borrow();
            for &cpu_adx in cpus_perf.iter() {
                cpu_set.insert(cpu_adx);
            }

            let mut cpus_ovflw: Vec<usize> = vec![];
            for &cpu_adx in ovrflw_cpus_all.iter() {
                if cpu_set.get(&cpu_adx).is_none() {
                    cpus_ovflw.push(cpu_adx);
                    cpu_set.insert(cpu_adx);
                }
            }

            // Inject the constructed @cpus_ovflw to the table.
            let mut v = cpu_order.cpus_ovflw.borrow_mut();
            v.extend(cpus_ovflw.iter().cloned());
        }

        // Debug print of the generated table
        debug!("## gen_perf_cpu_order_table");
        debug!("{:#?}", perf_cpu_order);
    }
635
636    /// Sort the CPU IDs by topological order (@self.cpus_topological_order).
637    fn sort_cpus_by_topological_order(&'a self, cpus: &Vec<usize>) -> Vec<usize> {
638        let mut sorted: Vec<usize> = vec![];
639        for &cpu_adx in self.cpus_topological_order.iter() {
640            if let Some(_) = cpus.iter().find(|&&x| x == cpu_adx) {
641                sorted.push(cpu_adx);
642            }
643        }
644        sorted
645    }
646
647    /// Generate a table of performance vs. performance domain sets
648    /// (@self.perf_pdss) from all the possible performance domain & state
649    /// combinations (@self.pdss_infos).
650    ///
651    /// An example result is as follows:
652    ///     PERF: [_, 300]
653    ///             pd:id: 0 -- cpu_vid: 0
654    ///             pd:id: 0 -- cpu_vid: 1
655    ///     PERF: [_, 1138]
656    ///             pd:id: 0 -- cpu_vid: 0
657    ///             pd:id: 0 -- cpu_vid: 1
658    ///             pd:id: 1 -- cpu_vid: 0
659    ///             pd:id: 1 -- cpu_vid: 1
660    ///     PERF: [_, 3386]
661    ///             pd:id: 1 -- cpu_vid: 0
662    ///             pd:id: 1 -- cpu_vid: 1
663    ///             pd:id: 1 -- cpu_vid: 2
664    ///             pd:id: 2 -- cpu_vid: 0
665    ///             pd:id: 2 -- cpu_vid: 1
666    ///     PERF: [_, 3977]
667    ///             pd:id: 0 -- cpu_vid: 0
668    ///             pd:id: 1 -- cpu_vid: 0
669    ///             pd:id: 1 -- cpu_vid: 1
670    ///             pd:id: 1 -- cpu_vid: 2
671    ///             pd:id: 2 -- cpu_vid: 0
672    ///             pd:id: 2 -- cpu_vid: 1
673    ///     PERF: [_, 4508]
674    ///             pd:id: 0 -- cpu_vid: 0
675    ///             pd:id: 0 -- cpu_vid: 1
676    ///             pd:id: 1 -- cpu_vid: 0
677    ///             pd:id: 1 -- cpu_vid: 1
678    ///             pd:id: 1 -- cpu_vid: 2
679    ///             pd:id: 2 -- cpu_vid: 0
680    ///             pd:id: 2 -- cpu_vid: 1
681    ///     PERF: [_, 5627]
682    ///             pd:id: 0 -- cpu_vid: 0
683    ///             pd:id: 0 -- cpu_vid: 1
684    ///             pd:id: 1 -- cpu_vid: 0
685    ///             pd:id: 1 -- cpu_vid: 1
686    ///             pd:id: 1 -- cpu_vid: 2
687    ///             pd:id: 2 -- cpu_vid: 0
688    ///             pd:id: 2 -- cpu_vid: 1
689    ///             pd:id: 3 -- cpu_vid: 0
    fn gen_perf_pds_table(&'a self) {
        // Target system utilization levels to cover.
        let utils = vec![0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0];

        // Find the best performance domains for each system utilization target.
        for &util in utils.iter() {
            let mut best_pdsi: Option<PDSetInfo<'a>>;
            let mut del_pdsi: Option<PDSetInfo<'a>> = None;

            // Use the highest performance level chosen so far (if any) as
            // the base, to bias the search toward minimal domain switches.
            match self.perf_pdsi.borrow().last_key_value() {
                Some((_, base)) => {
                    best_pdsi = self.find_perf_pds_for(util, Some(base));

                    // If the next performance level (@best_pdsi) is subsumed
                    // by the previous level (@base), extend the base to the
                    // next level. To this end, insert the extended base (with
                    // updated performance and power values) and delete the old
                    // base.
                    if let Some(ref best) = best_pdsi {
                        if best.pdcpu_set.is_subset(&base.pdcpu_set) {
                            let ext_pdcpu = PDSetInfo {
                                performance: best.performance,
                                power: best.power,
                                pdcpu_set: base.pdcpu_set.clone(),
                                pd_id_set: base.pd_id_set.clone(),
                            };
                            best_pdsi = Some(ext_pdcpu);
                            del_pdsi = Some(base.clone());
                        }
                    }
                }
                None => {
                    best_pdsi = self.find_perf_pds_for(util, None);
                }
            };

            // Mutations happen after the borrow taken by the match above
            // has been released.
            if let Some(best_pdsi) = best_pdsi {
                self.perf_pdsi
                    .borrow_mut()
                    .insert(best_pdsi.performance, best_pdsi);
            }

            if let Some(del_pdsi) = del_pdsi {
                self.perf_pdsi.borrow_mut().remove(&del_pdsi.performance);
            }
        }

        // Debug print of the generated table
        debug!("## gen_perf_pds_table");
        for (perf, pdsi) in self.perf_pdsi.borrow().iter() {
            debug!("PERF: [_, {}]", perf);
            for pdcpu in pdsi.pdcpu_set.iter() {
                debug!(
                    "        pd:id: {:?} -- cpu_vid: {}",
                    pdcpu.pd.id, pdcpu.cpu_vid
                );
            }
        }
    }
748
749    fn find_perf_pds_for(
750        &'a self,
751        util: f32,
752        base: Option<&PDSetInfo<'a>>,
753    ) -> Option<PDSetInfo<'a>> {
754        let target_perf = (util * self.tot_perf as f32) as usize;
755        let mut lookahead = 0;
756        let mut min_dist: usize = usize::MAX;
757        let mut best_pdsi: Option<PDSetInfo<'a>> = None;
758
759        let pdss_infos = self.pdss_infos.borrow();
760        for (&pdsi_perf, pdsi_set) in pdss_infos.iter() {
761            if pdsi_perf >= target_perf {
762                let pdsi_set_ref = pdsi_set.borrow();
763                for pdsi in pdsi_set_ref.iter() {
764                    let dist = pdsi.dist(base);
765                    if dist < min_dist {
766                        min_dist = dist;
767                        best_pdsi = Some(pdsi.clone());
768                    }
769                }
770                lookahead += 1;
771                if lookahead >= LOOKAHEAD_CNT {
772                    break;
773                }
774            }
775        }
776
777        best_pdsi
778    }
779
780    /// Generate all possible performance domain & state combinations,
781    /// @self.pdss_infos. Each combination represents a set of performance
782    /// domains (and their corresponding performance states) that achieve the
783    /// requested performance with minimal power consumption.
784    ///
785    /// We assume a 'reasonable load balancer,' so the CPU utilization of all
786    /// the involved CPUs is similar.
787    ///
788    /// An example result is as follows:
789    ///
790    ///     PERF: [_, 5135]
791    ///         perf: 5135 -- power: 5475348
792    ///             pd:id: 0 -- cpu_vid: 0
793    ///             pd:id: 1 -- cpu_vid: 0
794    ///             pd:id: 1 -- cpu_vid: 1
795    ///             pd:id: 1 -- cpu_vid: 2
796    ///             pd:id: 2 -- cpu_vid: 0
797    ///             pd:id: 2 -- cpu_vid: 1
798    ///             pd:id: 3 -- cpu_vid: 0
799    ///     PERF: [_, 5187]
800    ///         perf: 5187 -- power: 4844969
801    ///             pd:id: 0 -- cpu_vid: 0
802    ///             pd:id: 0 -- cpu_vid: 1
803    ///             pd:id: 1 -- cpu_vid: 0
804    ///             pd:id: 1 -- cpu_vid: 1
805    ///             pd:id: 1 -- cpu_vid: 2
806    ///             pd:id: 2 -- cpu_vid: 0
807    ///             pd:id: 2 -- cpu_vid: 1
808    ///             pd:id: 3 -- cpu_vid: 0
809    ///     PERF: [_, 5195]
810    ///         perf: 5195 -- power: 5924606
811    ///             pd:id: 1 -- cpu_vid: 0
812    ///             pd:id: 1 -- cpu_vid: 1
813    ///             pd:id: 1 -- cpu_vid: 2
814    ///             pd:id: 2 -- cpu_vid: 0
815    ///             pd:id: 2 -- cpu_vid: 1
816    ///             pd:id: 3 -- cpu_vid: 0
817    ///     PERF: [_, 5217]
818    ///         perf: 5217 -- power: 4894911
819    ///             pd:id: 0 -- cpu_vid: 0
820    ///             pd:id: 0 -- cpu_vid: 1
821    ///             pd:id: 1 -- cpu_vid: 0
822    ///             pd:id: 1 -- cpu_vid: 1
823    ///             pd:id: 1 -- cpu_vid: 2
824    ///             pd:id: 2 -- cpu_vid: 0
825    ///             pd:id: 2 -- cpu_vid: 1
826    ///             pd:id: 3 -- cpu_vid: 0
827    ///     PERF: [_, 5225]
828    ///         perf: 5225 -- power: 5665770
829    ///             pd:id: 0 -- cpu_vid: 0
830    ///             pd:id: 1 -- cpu_vid: 0
831    ///             pd:id: 1 -- cpu_vid: 1
832    ///             pd:id: 1 -- cpu_vid: 2
833    ///             pd:id: 2 -- cpu_vid: 0
834    ///             pd:id: 2 -- cpu_vid: 1
835    ///             pd:id: 3 -- cpu_vid: 0
836    ///     PERF: [_, 5316]
837    ///         perf: 5316 -- power: 5860568
838    ///             pd:id: 0 -- cpu_vid: 0
839    ///             pd:id: 1 -- cpu_vid: 0
840    ///             pd:id: 1 -- cpu_vid: 1
841    ///             pd:id: 1 -- cpu_vid: 2
842    ///             pd:id: 2 -- cpu_vid: 0
843    ///             pd:id: 2 -- cpu_vid: 1
844    ///             pd:id: 3 -- cpu_vid: 0
845    fn gen_all_pds_combinations(&'a self) {
846        // Start from the min (0%) and max (100%) CPU utilizations
847        let pdsi_vec = self.gen_pds_combinations(0.0);
848        self.insert_pds_combinations(&pdsi_vec);
849
850        let pdsi_vec = self.gen_pds_combinations(100.0);
851        self.insert_pds_combinations(&pdsi_vec);
852
853        // Then dive into the range between the min and max.
854        self.gen_perf_cpuset_table_range(0, 100);
855
856        // Debug print performance table
857        debug!("## gen_all_pds_combinations");
858        for (perf, pdss_info) in self.pdss_infos.borrow().iter() {
859            debug!("PERF: [_, {}]", perf);
860            for pdsi in pdss_info.borrow().iter() {
861                debug!("    perf: {} -- power: {}", pdsi.performance, pdsi.power);
862                for pdcpu in pdsi.pdcpu_set.iter() {
863                    debug!(
864                        "        pd:id: {:?} -- cpu_vid: {}",
865                        pdcpu.pd.id, pdcpu.cpu_vid
866                    );
867                }
868            }
869        }
870    }
871
872    fn gen_perf_cpuset_table_range(&'a self, low: isize, high: isize) {
873        if low > high {
874            return;
875        }
876
877        // If there is a new performance point in the middle,
878        // let's further explore. Otherwise, stop it here.
879        let mid: isize = low + (high - low) / 2;
880        let pdsi_vec = self.gen_pds_combinations(mid as f32);
881        let found_new = self.insert_pds_combinations(&pdsi_vec);
882        if found_new {
883            self.gen_perf_cpuset_table_range(mid + 1, high);
884            self.gen_perf_cpuset_table_range(low, mid - 1);
885        }
886    }
887
888    fn gen_pds_combinations(&'a self, util: f32) -> Vec<PDSetInfo<'a>> {
889        let mut pdsi_vec = Vec::new();
890
891        let pds_set = self.gen_pds_set(util);
892        let n = pds_set.len();
893        for k in 1..n {
894            let pdss = pds_set.clone();
895            let pds_cmbs: Vec<_> = Combinations::new(pdss, k)
896                .map(|cmb| PDSetInfo::new(cmb.clone()))
897                .collect();
898            pdsi_vec.extend(pds_cmbs);
899        }
900
901        let pdsi = PDSetInfo::new(pds_set.clone());
902        pdsi_vec.push(pdsi);
903
904        pdsi_vec
905    }
906
907    fn insert_pds_combinations(&self, new_pdsi_vec: &Vec<PDSetInfo<'a>>) -> bool {
908        // For the same performance, keep the PDS combinations with the lowest
909        // power consumption. If there are more than one lowest, keep them all
910        // to choose one later when assigning CPUs from the selected
911        // performance domains.
912        let mut found_new = false;
913
914        for new_pdsi in new_pdsi_vec.iter() {
915            let mut pdss_infos = self.pdss_infos.borrow_mut();
916            let v = pdss_infos.get(&new_pdsi.performance);
917            match v {
918                // There are already PDSetInfo in the list.
919                Some(v) => {
920                    let mut v = v.borrow_mut();
921                    let pdsi = &v.iter().next().unwrap();
922                    if pdsi.power == new_pdsi.power {
923                        // If the power consumptions are the same, keep both.
924                        if v.insert(new_pdsi.clone()) {
925                            found_new = true;
926                        }
927                    } else if pdsi.power > new_pdsi.power {
928                        // If the new one takes less power, keep the new one.
929                        v.clear();
930                        v.insert(new_pdsi.clone());
931                        found_new = true;
932                    }
933                }
934                // This is the first for the performance target.
935                None => {
936                    // Let's add it and move on.
937                    let mut v: HashSet<PDSetInfo<'a>> = HashSet::new();
938                    v.insert(new_pdsi.clone());
939                    pdss_infos.insert(new_pdsi.performance, v.into());
940                    found_new = true;
941                }
942            }
943        }
944        found_new
945    }
946
947    /// Get a vector of (performance domain, performance state) to achieve
948    /// the given CPU utilization, @util.
949    fn gen_pds_set(&self, util: f32) -> Vec<PDS<'_>> {
950        let mut pds_set = vec![];
951        for (_, pd) in self.em.perf_doms.iter() {
952            let ps = pd.select_perf_state(util).unwrap();
953            let pds = PDS::new(pd, ps);
954            pds_set.push(pds);
955        }
956        self.expand_pds_set(&mut pds_set);
957        pds_set
958    }
959
960    /// Expand a PDS vector such that a performance domain with X CPUs
961    /// has N elements in the vector. This is purely for generating
962    /// combinations easy.
963    fn expand_pds_set(&self, pds_set: &mut Vec<PDS<'_>>) {
964        let mut xset = vec![];
965        // For a performance domain having nr_cpus, add nr_cpus-1 more
966        // PDS to make the PDS nr_cpus in the vector.
967        for pds in pds_set.iter() {
968            let nr_cpus = pds.pd.span.weight();
969            for _ in 1..nr_cpus {
970                xset.push(pds.clone());
971            }
972        }
973        pds_set.append(&mut xset);
974
975        // Sort the pds_set for easy comparision.
976        pds_set.sort();
977    }
978}
979
980impl<'a> PDS<'_> {
981    fn new(pd: &'a PerfDomain, ps: &'a PerfState) -> PDS<'a> {
982        PDS { pd, ps }
983    }
984}
985
986impl PartialEq for PDS<'_> {
987    fn eq(&self, other: &Self) -> bool {
988        self.pd == other.pd && self.ps == other.ps
989    }
990}
991
992impl<'a> PDCpu<'_> {
993    fn new(pd: &'a PerfDomain, cpu_vid: usize) -> PDCpu<'a> {
994        PDCpu { pd, cpu_vid }
995    }
996}
997
998impl PartialEq for PDCpu<'_> {
999    fn eq(&self, other: &Self) -> bool {
1000        self.pd == other.pd && self.cpu_vid == other.cpu_vid
1001    }
1002}
1003
1004impl fmt::Display for PDS<'_> {
1005    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1006        write!(
1007            f,
1008            "pd:id:{}/pd:weight:{}/ps:cap:{}/ps:power:{}",
1009            self.pd.id,
1010            self.pd.span.weight(),
1011            self.ps.performance,
1012            self.ps.power,
1013        )?;
1014        Ok(())
1015    }
1016}
1017
1018impl<'a> PDSetInfo<'_> {
1019    fn new(pds_set: Vec<PDS<'a>>) -> PDSetInfo<'a> {
1020        // Create a pd_id_set and calculate performance and power.
1021        let mut performance = 0;
1022        let mut power = 0;
1023        let mut pd_id_set: BTreeSet<usize> = BTreeSet::new();
1024
1025        for pds in pds_set.iter() {
1026            performance += pds.ps.performance;
1027            power += pds.ps.power;
1028            pd_id_set.insert(pds.pd.id);
1029        }
1030
1031        // Create a pdcpu_set, so first gather the same PDS entires.
1032        let mut pds_map: BTreeMap<PDS<'a>, RefCell<Vec<PDS<'a>>>> = BTreeMap::new();
1033
1034        for pds in pds_set.iter() {
1035            let v = pds_map.get(&pds);
1036            match v {
1037                Some(v) => {
1038                    let mut v = v.borrow_mut();
1039                    v.push(pds.clone());
1040                }
1041                None => {
1042                    let mut v: Vec<PDS<'a>> = Vec::new();
1043                    v.push(pds.clone());
1044                    pds_map.insert(pds.clone(), v.into());
1045                }
1046            }
1047        }
1048        // Then assign cpu virtual ids to pdcpu_set.
1049        let mut pdcpu_set: BTreeSet<PDCpu<'a>> = BTreeSet::new();
1050        let pds_map = pds_map;
1051
1052        for (_, v) in pds_map.iter() {
1053            for (cpu_vid, pds) in v.borrow().iter().enumerate() {
1054                let pdcpu = PDCpu::new(pds.pd, cpu_vid);
1055                pdcpu_set.insert(pdcpu);
1056            }
1057        }
1058
1059        PDSetInfo {
1060            performance,
1061            power,
1062            pdcpu_set,
1063            pd_id_set,
1064        }
1065    }
1066
1067    /// Calculate the distance from @base to @self. We minimize the number of
1068    /// performance domains involved to reduce the leakage power consumption.
1069    /// We then maximize the overlap between the previous (i.e., base)
1070    /// performance domains and the new one for a smooth transition to the new
1071    /// cpuset with higher cache locality. Finally, we minimize the number of
1072    /// CPUs involved, thereby reducing the chance of contention for shared
1073    /// hardware resources (e.g., shared cache).
1074    fn dist(&self, base: Option<&PDSetInfo<'a>>) -> usize {
1075        let nr_pds = self.pd_id_set.len();
1076        let nr_pds_overlap = match base {
1077            Some(base) => self.pd_id_set.intersection(&base.pd_id_set).count(),
1078            None => 0,
1079        };
1080        let nr_cpus = self.pdcpu_set.len();
1081
1082        ((nr_pds - nr_pds_overlap) * PD_UNIT) +         // # non-overlapping PDs
1083        ((*NR_CPU_IDS - nr_cpus) * CPU_UNIT) +          // # of CPUs
1084        (*NR_CPU_IDS - self.pd_id_set.first().unwrap()) // PD ID as a tiebreaker
1085    }
1086}
1087
1088impl PartialEq for PDSetInfo<'_> {
1089    fn eq(&self, other: &Self) -> bool {
1090        self.performance == other.performance
1091            && self.power == other.power
1092            && self.pdcpu_set == other.pdcpu_set
1093    }
1094}
1095
impl Hash for PDSetInfo<'_> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Hashing only pdcpu_set satisfies the Eq/Hash contract: equal
        // PDSetInfos (per PartialEq) necessarily have equal pdcpu_sets,
        // so they hash identically. performance, power, and pd_id_set
        // are derived summaries; leaving them out can only cause benign
        // hash collisions, never contract violations.
        self.pdcpu_set.hash(state);
    }
}
1103
impl PartialEq for PerfCpuOrder {
    fn eq(&self, other: &Self) -> bool {
        // Two orders compare equal on their capacity bound alone; the
        // primary/overflow CPU lists are deliberately ignored.
        // NOTE(review): assumes perf_cap uniquely identifies an order
        // within whatever collection holds these -- confirm at call sites.
        self.perf_cap == other.perf_cap
    }
}
1109
1110impl fmt::Display for PerfCpuOrder {
1111    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1112        write!(
1113            f,
1114            "capacity bound:  {} ({}%)\n",
1115            self.perf_cap,
1116            self.perf_util * 100.0
1117        )?;
1118        write!(f, "  primary CPUs:  {:?}\n", self.cpus_perf.borrow())?;
1119        write!(f, "  overflow CPUs: {:?}", self.cpus_ovflw.borrow())?;
1120        Ok(())
1121    }
1122}