scx_lavd/
cpu_order.rs

1// SPDX-License-Identifier: GPL-2.0
2//
3// Copyright (c) 2025 Valve Corporation.
4// Author: Changwoo Min <changwoo@igalia.com>
5
6// This software may be used and distributed according to the terms of the
7// GNU General Public License version 2.
8
9use anyhow::Result;
10use combinations::Combinations;
11use itertools::iproduct;
12use log::debug;
13use scx_utils::CoreType;
14use scx_utils::Cpumask;
15use scx_utils::EnergyModel;
16use scx_utils::PerfDomain;
17use scx_utils::PerfState;
18use scx_utils::Topology;
19use scx_utils::NR_CPU_IDS;
20use std::cell::Cell;
21use std::cell::RefCell;
22use std::collections::BTreeMap;
23use std::collections::BTreeSet;
24use std::collections::HashSet;
25use std::fmt;
26use std::hash::{Hash, Hasher};
27
/// Fully-qualified position of a single CPU in the host topology, plus its
/// per-CPU properties (capacity, cache, core type).
#[derive(Debug, Clone)]
pub struct CpuId {
    // - *_adx: an absolute index within a system scope
    // - *_rdx: a relative index under a parent
    //
    // - node_adx: a NUMA domain within a system
    // - pd_adx: a performance domain (CPU frequency domain) within a system
    //   - llc_rdx: an LLC domain (CCX) under a NUMA domain
    //     - core_rdx: a core under a LLC domain
    //       - cpu_rdx: a CPU under a core
    pub node_adx: usize,
    pub pd_adx: usize,
    pub llc_rdx: usize,
    pub core_rdx: usize,
    pub cpu_rdx: usize,
    pub cpu_adx: usize,   // absolute CPU index within the system
    pub smt_level: usize, // SMT level from Topology (assumed: # of HW threads on the core — confirm)
    pub cache_size: usize, // cache size from Topology (units not shown here — confirm)
    pub cpu_cap: usize,   // CPU capacity (Topology's cpu_capacity)
    pub big_core: bool,   // true unless the core type is Little
    pub turbo_core: bool, // true only for Big { turbo: true } cores
}
50
/// Key identifying a compute domain: the set of CPUs that share the same
/// NUMA node, LLC, and core type (big vs. LITTLE).
#[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Clone)]
pub struct ComputeDomainId {
    pub node_adx: usize, // absolute NUMA node index
    pub llc_rdx: usize,  // LLC index relative to the node
    pub is_big: bool,    // whether the domain consists of big cores
}
57
/// A compute domain: the CPUs belonging to one ComputeDomainId, plus its
/// precomputed relationships to the other domains.
#[derive(Debug, Clone)]
pub struct ComputeDomain {
    pub cpdom_id: usize,           // unique ID of this domain
    pub cpdom_alt_id: Cell<usize>, // closest domain of the opposite core type
    pub cpu_ids: Vec<usize>,       // absolute CPU IDs belonging to this domain
    // Neighbor domain IDs grouped by distance (see CpuOrderCtx::dist).
    pub neighbor_map: RefCell<BTreeMap<usize, RefCell<Vec<usize>>>>,
}
65
/// CPU preference order associated with one performance capacity level:
/// the CPUs serving loads up to @perf_cap, and the CPUs to spill onto
/// when the load exceeds it.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct PerfCpuOrder {
    pub perf_cap: usize,                 // performance in capacity
    pub perf_util: f32,                  // performance in utilization, [0, 1]
    pub cpus_perf: RefCell<Vec<usize>>,  // CPU adx order within the performance range by @perf_cap
    pub cpus_ovflw: RefCell<Vec<usize>>, // CPU adx order beyond @perf_cap
}
74
/// The top-level CPU preference order for the host, built once at startup
/// (see CpuOrder::new).
#[derive(Debug)]
#[allow(dead_code)]
pub struct CpuOrder {
    pub all_cpus_mask: Cpumask, // mask of all online CPUs (Topology's span)
    pub cpuids: Vec<CpuId>,     // CPUs in performance-optimized topological order
    // CPU orders per performance capacity level, keyed by perf_cap.
    pub perf_cpu_order: BTreeMap<usize, PerfCpuOrder>,
    pub cpdom_map: BTreeMap<ComputeDomainId, ComputeDomain>,
    pub smt_enabled: bool,      // whether SMT is enabled on this host
    pub has_biglittle: bool,    // whether the host has heterogeneous cores
    pub has_energy_model: bool, // whether an energy model was available
}
86
87impl CpuOrder {
88    /// Build a cpu preference order
89    pub fn new() -> Result<CpuOrder> {
90        let ctx = CpuOrderCtx::new();
91        let cpus_pf = ctx.build_topo_order(false).unwrap();
92        let cpus_ps = ctx.build_topo_order(true).unwrap();
93        let cpdom_map = CpuOrderCtx::build_cpdom(&cpus_pf).unwrap();
94        let perf_cpu_order = if ctx.em.is_ok() {
95            let em = ctx.em.unwrap();
96            EnergyModelOptimizer::get_perf_cpu_order_table(&em, &cpus_pf)
97        } else {
98            EnergyModelOptimizer::get_fake_perf_cpu_order_table(&cpus_pf, &cpus_ps)
99        };
100
101        Ok(CpuOrder {
102            all_cpus_mask: ctx.topo.span,
103            cpuids: cpus_pf,
104            perf_cpu_order,
105            cpdom_map,
106            smt_enabled: ctx.smt_enabled,
107            has_biglittle: ctx.has_biglittle,
108            has_energy_model: ctx.has_energy_model,
109        })
110    }
111}
112
/// CpuOrderCtx is a helper struct used to build a CpuOrder
struct CpuOrderCtx {
    topo: Topology,          // host topology
    em: Result<EnergyModel>, // energy model, if the platform provides one
    smt_enabled: bool,       // whether SMT is enabled (from Topology)
    has_biglittle: bool,     // whether the host has LITTLE cores
    has_energy_model: bool,  // whether @em is Ok
}
121
122impl CpuOrderCtx {
123    fn new() -> Self {
124        let topo = Topology::new().expect("Failed to build host topology");
125        let em = EnergyModel::new();
126        let smt_enabled = topo.smt_enabled;
127        let has_biglittle = topo.has_little_cores();
128        let has_energy_model = em.is_ok();
129
130        debug!("{:#?}", topo);
131        debug!("{:#?}", em);
132
133        CpuOrderCtx {
134            topo,
135            em,
136            smt_enabled,
137            has_biglittle,
138            has_energy_model,
139        }
140    }
141
142    /// Build a CPU preference order based on its optimization target
143    fn build_topo_order(&self, prefer_powersave: bool) -> Option<Vec<CpuId>> {
144        let mut cpu_ids = Vec::new();
145
146        // Build a vector of cpu ids.
147        for (&node_adx, node) in self.topo.nodes.iter() {
148            for (llc_rdx, (&llc_adx, llc)) in node.llcs.iter().enumerate() {
149                for (core_rdx, (_core_adx, core)) in llc.cores.iter().enumerate() {
150                    for (cpu_rdx, (cpu_adx, cpu)) in core.cpus.iter().enumerate() {
151                        let cpu_adx = *cpu_adx;
152                        let pd_adx = Self::get_pd_id(&self.em, cpu_adx, llc_adx);
153                        let cpu_id = CpuId {
154                            node_adx,
155                            pd_adx,
156                            llc_rdx,
157                            core_rdx,
158                            cpu_rdx,
159                            cpu_adx,
160                            smt_level: cpu.smt_level,
161                            cache_size: cpu.cache_size,
162                            cpu_cap: cpu.cpu_capacity,
163                            big_core: cpu.core_type != CoreType::Little,
164                            turbo_core: cpu.core_type == CoreType::Big { turbo: true },
165                        };
166                        cpu_ids.push(RefCell::new(cpu_id));
167                    }
168                }
169            }
170        }
171
172        // Convert a vector of RefCell to a vector of plain cpu_ids
173        let mut cpu_ids2 = Vec::new();
174        for cpu_id in cpu_ids.iter() {
175            cpu_ids2.push(cpu_id.borrow().clone());
176        }
177        let mut cpu_ids = cpu_ids2;
178
179        // Sort the cpu_ids
180        match (prefer_powersave, self.has_biglittle) {
181            // 1. powersave,      no  big/little
182            //     * within the same LLC domain
183            //         - node_adx, llc_rdx,
184            //     * prefer more capable CPU with higher capacity
185            //       and larger cache
186            //         - ^cpu_cap (chip binning), ^cache_size,
187            //     * prefere the SMT core within the same performance domain
188            //         - pd_adx, core_rdx, ^smt_level, cpu_rdx
189            (true, false) => {
190                cpu_ids.sort_by(|a, b| {
191                    a.node_adx
192                        .cmp(&b.node_adx)
193                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
194                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
195                        .then_with(|| b.cache_size.cmp(&a.cache_size))
196                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
197                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
198                        .then_with(|| b.smt_level.cmp(&a.smt_level))
199                        .then_with(|| a.cpu_rdx.cmp(&b.cpu_rdx))
200                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
201                });
202            }
203            // 2. powersave,      yes big/little
204            //     * within the same LLC domain
205            //         - node_adx, llc_rdx,
206            //     * prefer energy-efficient LITTLE CPU with a larger cache
207            //         - cpu_cap (big/little), ^cache_size,
208            //     * prefere the SMT core within the same performance domain
209            //         - pd_adx, core_rdx, ^smt_level, cpu_rdx
210            (true, true) => {
211                cpu_ids.sort_by(|a, b| {
212                    a.node_adx
213                        .cmp(&b.node_adx)
214                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
215                        .then_with(|| a.cpu_cap.cmp(&b.cpu_cap))
216                        .then_with(|| b.cache_size.cmp(&a.cache_size))
217                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
218                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
219                        .then_with(|| b.smt_level.cmp(&a.smt_level))
220                        .then_with(|| a.cpu_rdx.cmp(&b.cpu_rdx))
221                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
222                });
223            }
224            // 3. performance,    no  big/little
225            // 4. performance,    yes big/little
226            //     * prefer the non-SMT core
227            //         - cpu_rdx,
228            //     * fill the same LLC domain first
229            //         - node_adx, llc_rdx,
230            //     * prefer more capable CPU with higher capacity
231            //       (chip binning or big/little) and larger cache
232            //         - ^cpu_cap, ^cache_size, smt_level
233            //     * within the same power domain
234            //         - pd_adx, core_rdx
235            _ => {
236                cpu_ids.sort_by(|a, b| {
237                    a.cpu_rdx
238                        .cmp(&b.cpu_rdx)
239                        .then_with(|| a.node_adx.cmp(&b.node_adx))
240                        .then_with(|| a.llc_rdx.cmp(&b.llc_rdx))
241                        .then_with(|| b.cpu_cap.cmp(&a.cpu_cap))
242                        .then_with(|| b.cache_size.cmp(&a.cache_size))
243                        .then_with(|| a.smt_level.cmp(&b.smt_level))
244                        .then_with(|| a.pd_adx.cmp(&b.pd_adx))
245                        .then_with(|| a.core_rdx.cmp(&b.core_rdx))
246                        .then_with(|| a.cpu_adx.cmp(&b.cpu_adx))
247                });
248            }
249        }
250
251        Some(cpu_ids)
252    }
253
254    /// Build a list of compute domains
255    fn build_cpdom(cpu_ids: &Vec<CpuId>) -> Option<BTreeMap<ComputeDomainId, ComputeDomain>> {
256        // Note that building compute domain is independent to CPU orer
257        // so it is okay to use any cpus_*.
258
259        // Creat a compute domain map, where a compute domain is a CPUs that
260        // are under the same node and LLC and have the same core type.
261        let mut cpdom_id = 0;
262        let mut cpdom_map: BTreeMap<ComputeDomainId, ComputeDomain> = BTreeMap::new();
263        let mut cpdom_types: BTreeMap<usize, bool> = BTreeMap::new();
264        for cpu_id in cpu_ids.iter() {
265            let key = ComputeDomainId {
266                node_adx: cpu_id.node_adx,
267                llc_rdx: cpu_id.llc_rdx,
268                is_big: cpu_id.big_core,
269            };
270            let value = cpdom_map.entry(key.clone()).or_insert_with(|| {
271                let val = ComputeDomain {
272                    cpdom_id,
273                    cpdom_alt_id: Cell::new(cpdom_id),
274                    cpu_ids: Vec::new(),
275                    neighbor_map: RefCell::new(BTreeMap::new()),
276                };
277                cpdom_types.insert(cpdom_id, key.is_big);
278
279                cpdom_id += 1;
280                val
281            });
282            value.cpu_ids.push(cpu_id.cpu_adx);
283        }
284
285        // Build a neighbor map for each compute domain, where neighbors are
286        // ordered by core type, node, and LLC.
287        for ((from_k, from_v), (to_k, to_v)) in iproduct!(cpdom_map.iter(), cpdom_map.iter()) {
288            if from_k == to_k {
289                continue;
290            }
291
292            let d = Self::dist(from_k, to_k);
293            let mut map = from_v.neighbor_map.borrow_mut();
294            match map.get(&d) {
295                Some(v) => {
296                    v.borrow_mut().push(to_v.cpdom_id);
297                }
298                None => {
299                    map.insert(d, RefCell::new(vec![to_v.cpdom_id]));
300                }
301            }
302        }
303
304        // Fill up cpdom_alt_id for each compute domain.
305        for (k, v) in cpdom_map.iter() {
306            let mut key = k.clone();
307            key.is_big = !k.is_big;
308
309            if let Some(alt_v) = cpdom_map.get(&key) {
310                // First, try to find an alternative domain
311                // under the same node/LLC.
312                v.cpdom_alt_id.set(alt_v.cpdom_id);
313            } else {
314                // If there is no alternative domain in the same node/LLC,
315                // choose the closest one.
316                //
317                // Note that currently, the idle CPU selection (pick_idle_cpu)
318                // is not optimized for this kind of architecture, where big
319                // and LITTLE cores are in different node/LLCs.
320                'outer: for (_dist, ncpdoms) in v.neighbor_map.borrow().iter() {
321                    for ncpdom_id in ncpdoms.borrow().iter() {
322                        if let Some(is_big) = cpdom_types.get(ncpdom_id) {
323                            if *is_big == key.is_big {
324                                v.cpdom_alt_id.set(*ncpdom_id);
325                                break 'outer;
326                            }
327                        }
328                    }
329                }
330            }
331        }
332
333        Some(cpdom_map)
334    }
335
336    /// Get the performance domain (i.e., CPU frequency domain) ID for a CPU.
337    /// If the energy model is not available, use LLC ID instead.
338    fn get_pd_id(em: &Result<EnergyModel>, cpu_adx: usize, llc_adx: usize) -> usize {
339        match em {
340            Ok(em) => em.get_pd_by_cpu_id(cpu_adx).unwrap().id,
341            Err(_) => llc_adx,
342        }
343    }
344
345    /// Calculate distance from two compute domains
346    fn dist(from: &ComputeDomainId, to: &ComputeDomainId) -> usize {
347        let mut d = 0;
348        // core type > numa node > llc
349        if from.is_big != to.is_big {
350            d += 3;
351        }
352        if from.node_adx != to.node_adx {
353            d += 2;
354        } else {
355            if from.llc_rdx != to.llc_rdx {
356                d += 1;
357            }
358        }
359        d
360    }
361}
362
/// EnergyModelOptimizer computes, from the energy model, which CPUs (and
/// how many) should serve each system performance level with minimal
/// power, producing the perf_cpu_order table consumed by CpuOrder.
#[derive(Debug)]
struct EnergyModelOptimizer<'a> {
    // Energy model of performance domains
    em: &'a EnergyModel,

    // CPU preference order in a performance mode purely based on topology
    cpus_topological_order: Vec<usize>,

    // CPU preference order within a performance domain
    pd_cpu_order: BTreeMap<usize, RefCell<Vec<usize>>>,

    // Total performance capacity of the system
    tot_perf: usize,

    // All possible combinations of performance domains & states
    // indexed by performance.
    pdss_infos: RefCell<BTreeMap<usize, RefCell<HashSet<PDSetInfo<'a>>>>>,

    // Performance domains and states to achieve a certain performance level,
    // which is derived from @pdss_infos.
    perf_pdsi: RefCell<BTreeMap<usize, PDSetInfo<'a>>>,

    // CPU orders indexed by performance
    perf_cpu_order: RefCell<BTreeMap<usize, PerfCpuOrder>>,
}
388
/// A performance domain paired with one of its performance states.
/// NOTE(review): Eq is derived without PartialEq here — PartialEq is
/// presumably implemented manually later in this file (the Hash/Hasher
/// import suggests manual impls); confirm.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialOrd)]
struct PDS<'a> {
    pd: &'a PerfDomain,
    ps: &'a PerfState,
}
394
/// One CPU slot within a performance domain, identified by its virtual
/// (per-domain) CPU ID rather than an absolute CPU ID.
/// NOTE(review): Eq is derived without PartialEq here — a manual PartialEq
/// impl presumably exists later in this file; confirm.
#[derive(Debug, Clone, Eq, Hash, Ord, PartialOrd)]
struct PDCpu<'a> {
    pd: &'a PerfDomain, // performance domain
    cpu_vid: usize,     // virtual ID of a CPU on the performance domain
}
400
/// A candidate set of (performance domain, virtual CPU) slots, with the
/// aggregate performance it achieves and the power it costs.
/// NOTE(review): Eq is derived without PartialEq/Hash — manual impls
/// presumably follow later in this file (Hasher is imported); confirm.
#[derive(Debug, Clone, Eq)]
struct PDSetInfo<'a> {
    performance: usize,         // aggregate performance of this set
    power: usize,               // aggregate power cost of this set
    pdcpu_set: BTreeSet<PDCpu<'a>>,
    pd_id_set: BTreeSet<usize>, // pd:id:0, pd:id:1
}
408
// Fixed-point scaling units. NOTE(review): their exact roles are defined
// by code below this chunk; they appear to be per-domain and per-CPU scale
// factors — confirm against the combination generator.
const PD_UNIT: usize = 100_000_000;
const CPU_UNIT: usize = 100_000;
// Number of qualifying performance levels examined when searching for the
// best PD set (see find_perf_pds_for()).
const LOOKAHEAD_CNT: usize = 10;
412
413impl<'a> EnergyModelOptimizer<'a> {
414    fn new(em: &'a EnergyModel, cpus_pf: &'a Vec<CpuId>) -> EnergyModelOptimizer<'a> {
415        let tot_perf = em.perf_total();
416
417        let pdss_infos: BTreeMap<usize, RefCell<HashSet<PDSetInfo<'a>>>> = BTreeMap::new();
418        let pdss_infos = pdss_infos.into();
419
420        let perf_pdsi: BTreeMap<usize, PDSetInfo<'a>> = BTreeMap::new();
421        let perf_pdsi = perf_pdsi.into();
422
423        let mut pd_cpu_order: BTreeMap<usize, RefCell<Vec<usize>>> = BTreeMap::new();
424        let mut cpus_topological_order: Vec<usize> = vec![];
425        for cpuid in cpus_pf.iter() {
426            match pd_cpu_order.get(&cpuid.pd_adx) {
427                Some(v) => {
428                    let mut v = v.borrow_mut();
429                    v.push(cpuid.cpu_adx);
430                }
431                None => {
432                    let v = vec![cpuid.cpu_adx];
433                    pd_cpu_order.insert(cpuid.pd_adx, v.into());
434                }
435            }
436            cpus_topological_order.push(cpuid.cpu_adx);
437        }
438
439        let perf_cpu_order: BTreeMap<usize, PerfCpuOrder> = BTreeMap::new();
440        let perf_cpu_order = perf_cpu_order.into();
441
442        debug!("# pd_cpu_order");
443        debug!("{:#?}", pd_cpu_order);
444
445        EnergyModelOptimizer {
446            em,
447            cpus_topological_order,
448            pd_cpu_order,
449            tot_perf,
450            pdss_infos,
451            perf_pdsi,
452            perf_cpu_order,
453        }
454    }
455
456    fn get_perf_cpu_order_table(
457        em: &'a EnergyModel,
458        cpus_pf: &'a Vec<CpuId>,
459    ) -> BTreeMap<usize, PerfCpuOrder> {
460        let emo = EnergyModelOptimizer::new(em, &cpus_pf);
461        emo.gen_perf_cpu_order_table();
462        let perf_cpu_order = emo.perf_cpu_order.borrow().clone();
463
464        perf_cpu_order
465    }
466
467    fn get_fake_perf_cpu_order_table(
468        cpus_pf: &'a Vec<CpuId>,
469        cpus_ps: &'a Vec<CpuId>,
470    ) -> BTreeMap<usize, PerfCpuOrder> {
471        let tot_perf: usize = cpus_pf.iter().map(|cpuid| cpuid.cpu_cap).sum();
472
473        let pco_pf = Self::fake_pco(tot_perf, cpus_pf, false);
474        let pco_ps = Self::fake_pco(tot_perf, cpus_ps, true);
475
476        let mut perf_cpu_order: BTreeMap<usize, PerfCpuOrder> = BTreeMap::new();
477        perf_cpu_order.insert(pco_pf.perf_cap, pco_pf);
478        perf_cpu_order.insert(pco_ps.perf_cap, pco_ps);
479
480        perf_cpu_order
481    }
482
483    fn fake_pco(tot_perf: usize, cpuids: &'a Vec<CpuId>, powersave: bool) -> PerfCpuOrder {
484        let perf_cap;
485
486        if powersave {
487            perf_cap = cpuids[0].cpu_cap;
488        } else {
489            perf_cap = tot_perf;
490        }
491
492        let perf_util: f32 = (perf_cap as f32) / (tot_perf as f32);
493        let cpus: Vec<usize> = cpuids.iter().map(|cpuid| cpuid.cpu_adx).collect();
494        let cpus_perf: Vec<usize> = cpus[..1].iter().map(|&cpuid| cpuid).collect();
495        let cpus_ovflw: Vec<usize> = cpus[1..].iter().map(|&cpuid| cpuid).collect();
496        PerfCpuOrder {
497            perf_cap,
498            perf_util,
499            cpus_perf: cpus_perf.clone().into(),
500            cpus_ovflw: cpus_ovflw.clone().into(),
501        }
502    }
503
    /// Generate the performance versus CPU preference order table based on
    /// the system's CPU topology and energy model. The table consists of the
    /// following information (PerfCpuOrder):
    ///
    ///   - PerfCpuOrder::perf_cap: The upper bound of the performance
    ///     capacity covered by this tuple.
    ///
    ///   - PerfCpuOrder::cpus_perf: Primary CPUs to be used, ordered
    ///     by preference.
    ///
    ///   - PerfCpuOrder::cpus_ovflw: When the system load goes beyond
    ///     @perf_cap, the list of CPUs to be used is ordered by preference.
    ///
    /// The three steps below are strictly ordered: each one consumes the
    /// tables filled in by the previous one.
    fn gen_perf_cpu_order_table(&'a self) {
        // First, generate all possible combinations of CPUs (e.g., two CPUs
        // in performance domain 0 and three CPUs in performance domain 1) to
        // achieve the possible performance capacities with minimal energy
        // consumption. We assume a reasonable load balancer, so the
        // utilization of the used CPUs is similar.
        self.gen_all_pds_combinations();

        // Then, from all the possible combinations of performance versus
        // CPU sets, select a list of combinations that minimize the number of
        // active performance domains and reduce the number of performance
        // domain switches when changing performance levels.
        self.gen_perf_pds_table();

        // Finally, assign CPUs (@cpu_adx) to the virtual CPU ID (@cpu_vid) of
        // a performance domain.
        self.assign_cpu_vids();
    }
534
535    /// Generate a CPU order table for each performance range.
536    fn assign_cpu_vids(&'a self) {
537        // Generate CPU order within the performance range (@cpus_perf).
538        for (&perf_cap, pdsi) in self.perf_pdsi.borrow().iter() {
539            let mut cpus_perf: Vec<usize> = vec![];
540
541            for pdcpu in pdsi.pdcpu_set.iter() {
542                let pd_id = pdcpu.pd.id;
543                let cpu_vid = pdcpu.cpu_vid;
544                let cpu_order = self.pd_cpu_order.get(&pd_id).unwrap().borrow();
545                let cpu_adx = cpu_order[cpu_vid];
546                cpus_perf.push(cpu_adx);
547            }
548
549            let perf_util: f32 = (perf_cap as f32) / (self.tot_perf as f32);
550            let cpus_perf = self.sort_cpus_by_topological_order(&cpus_perf);
551            let cpus_ovflw: Vec<usize> = vec![];
552
553            let mut perf_cpu_order = self.perf_cpu_order.borrow_mut();
554            perf_cpu_order.insert(
555                perf_cap,
556                PerfCpuOrder {
557                    perf_cap,
558                    perf_util,
559                    cpus_perf: cpus_perf.clone().into(),
560                    cpus_ovflw: cpus_ovflw.clone().into(),
561                },
562            );
563        }
564
565        // Generate CPU order beyond the performance range (@cpus_ovflw).
566        let perf_cpu_order = self.perf_cpu_order.borrow();
567        let perf_caps: Vec<_> = self.perf_pdsi.borrow().keys().cloned().collect();
568        for o in 1..perf_caps.len() {
569            // Gather all @cpus_perf from the upper performance ranges.
570            let ovrflw_perf_caps = &perf_caps[o..];
571            let mut ovrflw_cpus_all: Vec<usize> = vec![];
572            for perf_cap in ovrflw_perf_caps.iter() {
573                let cpu_order = perf_cpu_order.get(perf_cap).unwrap();
574                let cpus_perf = cpu_order.cpus_perf.borrow();
575                ovrflw_cpus_all.extend(cpus_perf.iter().cloned());
576            }
577
578            // Filter out already taken CPUs from the @ovrflw_cpus_all,
579            // and build @cpus_ovrflw.
580            let mut cpu_set = HashSet::<usize>::new();
581            let perf_cap = perf_caps[o - 1];
582            let cpu_order = perf_cpu_order.get(&perf_cap).unwrap();
583            let cpus_perf = cpu_order.cpus_perf.borrow();
584            for &cpu_adx in cpus_perf.iter() {
585                cpu_set.insert(cpu_adx);
586            }
587
588            let mut cpus_ovflw: Vec<usize> = vec![];
589            for &cpu_adx in ovrflw_cpus_all.iter() {
590                if cpu_set.get(&cpu_adx).is_none() {
591                    cpus_ovflw.push(cpu_adx);
592                    cpu_set.insert(cpu_adx);
593                }
594            }
595
596            // Inject the constructed @cpus_ovrflw to the table.
597            let mut v = cpu_order.cpus_ovflw.borrow_mut();
598            v.extend(cpus_ovflw.iter().cloned());
599        }
600
601        // Debug print of the generated table
602        debug!("## gen_perf_cpu_order_table");
603        debug!("{:#?}", perf_cpu_order);
604    }
605
606    /// Sort the CPU IDs by topological order (@self.cpus_topological_order).
607    fn sort_cpus_by_topological_order(&'a self, cpus: &Vec<usize>) -> Vec<usize> {
608        let mut sorted: Vec<usize> = vec![];
609        for &cpu_adx in self.cpus_topological_order.iter() {
610            if let Some(_) = cpus.iter().find(|&&x| x == cpu_adx) {
611                sorted.push(cpu_adx);
612            }
613        }
614        sorted
615    }
616
    /// Generate a table of performance vs. performance domain sets
    /// (@self.perf_pdss) from all the possible performance domain & state
    /// combinations (@self.pdss_infos).
    ///
    /// An example result is as follows:
    ///     PERF: [_, 300]
    ///             pd:id: 0 -- cpu_vid: 0
    ///             pd:id: 0 -- cpu_vid: 1
    ///     PERF: [_, 1138]
    ///             pd:id: 0 -- cpu_vid: 0
    ///             pd:id: 0 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 0
    ///             pd:id: 1 -- cpu_vid: 1
    ///     PERF: [_, 3386]
    ///             pd:id: 1 -- cpu_vid: 0
    ///             pd:id: 1 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 2
    ///             pd:id: 2 -- cpu_vid: 0
    ///             pd:id: 2 -- cpu_vid: 1
    ///     PERF: [_, 3977]
    ///             pd:id: 0 -- cpu_vid: 0
    ///             pd:id: 1 -- cpu_vid: 0
    ///             pd:id: 1 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 2
    ///             pd:id: 2 -- cpu_vid: 0
    ///             pd:id: 2 -- cpu_vid: 1
    ///     PERF: [_, 4508]
    ///             pd:id: 0 -- cpu_vid: 0
    ///             pd:id: 0 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 0
    ///             pd:id: 1 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 2
    ///             pd:id: 2 -- cpu_vid: 0
    ///             pd:id: 2 -- cpu_vid: 1
    ///     PERF: [_, 5627]
    ///             pd:id: 0 -- cpu_vid: 0
    ///             pd:id: 0 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 0
    ///             pd:id: 1 -- cpu_vid: 1
    ///             pd:id: 1 -- cpu_vid: 2
    ///             pd:id: 2 -- cpu_vid: 0
    ///             pd:id: 2 -- cpu_vid: 1
    ///             pd:id: 3 -- cpu_vid: 0
    fn gen_perf_pds_table(&'a self) {
        // Target system utilization levels to probe, from 5% to 100%.
        let utils = vec![0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0];

        // Find the best performance domains for each system utilization target.
        for &util in utils.iter() {
            let mut best_pdsi: Option<PDSetInfo<'a>>;
            let mut del_pdsi: Option<PDSetInfo<'a>> = None;

            // NOTE: the borrow() guard on @perf_pdsi lives until the end of
            // this match; the mutations are deferred to after it via
            // @best_pdsi/@del_pdsi to avoid a borrow_mut() panic.
            match self.perf_pdsi.borrow().last_key_value() {
                Some((_, base)) => {
                    best_pdsi = self.find_perf_pds_for(util, Some(base));

                    // If the next performance level (@best_pdsi) is subsumed
                    // by the previous level (@base), extend the base to the
                    // next level. To this end, insert the extended base (with
                    // updated performance and power values) and delete the old
                    // base.
                    if let Some(ref best) = best_pdsi {
                        if best.pdcpu_set.is_subset(&base.pdcpu_set) {
                            let ext_pdcpu = PDSetInfo {
                                performance: best.performance,
                                power: best.power,
                                pdcpu_set: base.pdcpu_set.clone(),
                                pd_id_set: base.pd_id_set.clone(),
                            };
                            best_pdsi = Some(ext_pdcpu);
                            del_pdsi = Some(base.clone());
                        }
                    }
                }
                None => {
                    // Very first utilization level: no base to compare with.
                    best_pdsi = self.find_perf_pds_for(util, None);
                }
            };

            // Commit the deferred insertion/removal now that the shared
            // borrow above has ended.
            if let Some(best_pdsi) = best_pdsi {
                self.perf_pdsi
                    .borrow_mut()
                    .insert(best_pdsi.performance, best_pdsi);
            }

            if let Some(del_pdsi) = del_pdsi {
                self.perf_pdsi.borrow_mut().remove(&del_pdsi.performance);
            }
        }

        // Debug print of the generated table
        debug!("## gen_perf_pds_table");
        for (perf, pdsi) in self.perf_pdsi.borrow().iter() {
            debug!("PERF: [_, {}]", perf);
            for pdcpu in pdsi.pdcpu_set.iter() {
                debug!(
                    "        pd:id: {:?} -- cpu_vid: {}",
                    pdcpu.pd.id, pdcpu.cpu_vid
                );
            }
        }
    }
718
719    fn find_perf_pds_for(
720        &'a self,
721        util: f32,
722        base: Option<&PDSetInfo<'a>>,
723    ) -> Option<PDSetInfo<'a>> {
724        let target_perf = (util * self.tot_perf as f32) as usize;
725        let mut lookahead = 0;
726        let mut min_dist: usize = usize::MAX;
727        let mut best_pdsi: Option<PDSetInfo<'a>> = None;
728
729        let pdss_infos = self.pdss_infos.borrow();
730        for (&pdsi_perf, pdsi_set) in pdss_infos.iter() {
731            if pdsi_perf >= target_perf {
732                let pdsi_set_ref = pdsi_set.borrow();
733                for pdsi in pdsi_set_ref.iter() {
734                    let dist = pdsi.dist(base);
735                    if dist < min_dist {
736                        min_dist = dist;
737                        best_pdsi = Some(pdsi.clone());
738                    }
739                }
740                lookahead += 1;
741                if lookahead >= LOOKAHEAD_CNT {
742                    break;
743                }
744            }
745        }
746
747        best_pdsi
748    }
749
750    /// Generate all possible performance domain & state combinations,
751    /// @self.pdss_infos. Each combination represents a set of performance
752    /// domains (and their corresponding performance states) that achieve the
753    /// requested performance with minimal power consumption.
754    ///
755    /// We assume a 'reasonable load balancer,' so the CPU utilization of all
756    /// the involved CPUs is similar.
757    ///
758    /// An example result is as follows:
759    ///
760    ///     PERF: [_, 5135]
761    ///         perf: 5135 -- power: 5475348
762    ///             pd:id: 0 -- cpu_vid: 0
763    ///             pd:id: 1 -- cpu_vid: 0
764    ///             pd:id: 1 -- cpu_vid: 1
765    ///             pd:id: 1 -- cpu_vid: 2
766    ///             pd:id: 2 -- cpu_vid: 0
767    ///             pd:id: 2 -- cpu_vid: 1
768    ///             pd:id: 3 -- cpu_vid: 0
769    ///     PERF: [_, 5187]
770    ///         perf: 5187 -- power: 4844969
771    ///             pd:id: 0 -- cpu_vid: 0
772    ///             pd:id: 0 -- cpu_vid: 1
773    ///             pd:id: 1 -- cpu_vid: 0
774    ///             pd:id: 1 -- cpu_vid: 1
775    ///             pd:id: 1 -- cpu_vid: 2
776    ///             pd:id: 2 -- cpu_vid: 0
777    ///             pd:id: 2 -- cpu_vid: 1
778    ///             pd:id: 3 -- cpu_vid: 0
779    ///     PERF: [_, 5195]
780    ///         perf: 5195 -- power: 5924606
781    ///             pd:id: 1 -- cpu_vid: 0
782    ///             pd:id: 1 -- cpu_vid: 1
783    ///             pd:id: 1 -- cpu_vid: 2
784    ///             pd:id: 2 -- cpu_vid: 0
785    ///             pd:id: 2 -- cpu_vid: 1
786    ///             pd:id: 3 -- cpu_vid: 0
787    ///     PERF: [_, 5217]
788    ///         perf: 5217 -- power: 4894911
789    ///             pd:id: 0 -- cpu_vid: 0
790    ///             pd:id: 0 -- cpu_vid: 1
791    ///             pd:id: 1 -- cpu_vid: 0
792    ///             pd:id: 1 -- cpu_vid: 1
793    ///             pd:id: 1 -- cpu_vid: 2
794    ///             pd:id: 2 -- cpu_vid: 0
795    ///             pd:id: 2 -- cpu_vid: 1
796    ///             pd:id: 3 -- cpu_vid: 0
797    ///     PERF: [_, 5225]
798    ///         perf: 5225 -- power: 5665770
799    ///             pd:id: 0 -- cpu_vid: 0
800    ///             pd:id: 1 -- cpu_vid: 0
801    ///             pd:id: 1 -- cpu_vid: 1
802    ///             pd:id: 1 -- cpu_vid: 2
803    ///             pd:id: 2 -- cpu_vid: 0
804    ///             pd:id: 2 -- cpu_vid: 1
805    ///             pd:id: 3 -- cpu_vid: 0
806    ///     PERF: [_, 5316]
807    ///         perf: 5316 -- power: 5860568
808    ///             pd:id: 0 -- cpu_vid: 0
809    ///             pd:id: 1 -- cpu_vid: 0
810    ///             pd:id: 1 -- cpu_vid: 1
811    ///             pd:id: 1 -- cpu_vid: 2
812    ///             pd:id: 2 -- cpu_vid: 0
813    ///             pd:id: 2 -- cpu_vid: 1
814    ///             pd:id: 3 -- cpu_vid: 0
815    fn gen_all_pds_combinations(&'a self) {
816        // Start from the min (0%) and max (100%) CPU utilizations
817        let pdsi_vec = self.gen_pds_combinations(0.0);
818        self.insert_pds_combinations(&pdsi_vec);
819
820        let pdsi_vec = self.gen_pds_combinations(100.0);
821        self.insert_pds_combinations(&pdsi_vec);
822
823        // Then dive into the range between the min and max.
824        self.gen_perf_cpuset_table_range(0, 100);
825
826        // Debug print performance table
827        debug!("## gen_all_pds_combinations");
828        for (perf, pdss_info) in self.pdss_infos.borrow().iter() {
829            debug!("PERF: [_, {}]", perf);
830            for pdsi in pdss_info.borrow().iter() {
831                debug!("    perf: {} -- power: {}", pdsi.performance, pdsi.power);
832                for pdcpu in pdsi.pdcpu_set.iter() {
833                    debug!(
834                        "        pd:id: {:?} -- cpu_vid: {}",
835                        pdcpu.pd.id, pdcpu.cpu_vid
836                    );
837                }
838            }
839        }
840    }
841
842    fn gen_perf_cpuset_table_range(&'a self, low: isize, high: isize) {
843        if low > high {
844            return;
845        }
846
847        // If there is a new performance point in the middle,
848        // let's further explore. Otherwise, stop it here.
849        let mid: isize = low + (high - low) / 2;
850        let pdsi_vec = self.gen_pds_combinations(mid as f32);
851        let found_new = self.insert_pds_combinations(&pdsi_vec);
852        if found_new {
853            self.gen_perf_cpuset_table_range(mid + 1, high);
854            self.gen_perf_cpuset_table_range(low, mid - 1);
855        }
856    }
857
858    fn gen_pds_combinations(&'a self, util: f32) -> Vec<PDSetInfo<'a>> {
859        let mut pdsi_vec = Vec::new();
860
861        let pds_set = self.gen_pds_set(util);
862        let n = pds_set.len();
863        for k in 1..n {
864            let pdss = pds_set.clone();
865            let pds_cmbs: Vec<_> = Combinations::new(pdss, k)
866                .map(|cmb| PDSetInfo::new(cmb.clone()))
867                .collect();
868            pdsi_vec.extend(pds_cmbs);
869        }
870
871        let pdsi = PDSetInfo::new(pds_set.clone());
872        pdsi_vec.push(pdsi);
873
874        pdsi_vec
875    }
876
877    fn insert_pds_combinations(&self, new_pdsi_vec: &Vec<PDSetInfo<'a>>) -> bool {
878        // For the same performance, keep the PDS combinations with the lowest
879        // power consumption. If there are more than one lowest, keep them all
880        // to choose one later when assigning CPUs from the selected
881        // performance domains.
882        let mut found_new = false;
883
884        for new_pdsi in new_pdsi_vec.iter() {
885            let mut pdss_infos = self.pdss_infos.borrow_mut();
886            let v = pdss_infos.get(&new_pdsi.performance);
887            match v {
888                // There are already PDSetInfo in the list.
889                Some(v) => {
890                    let mut v = v.borrow_mut();
891                    let pdsi = &v.iter().next().unwrap();
892                    if pdsi.power == new_pdsi.power {
893                        // If the power consumptions are the same, keep both.
894                        if v.insert(new_pdsi.clone()) {
895                            found_new = true;
896                        }
897                    } else if pdsi.power > new_pdsi.power {
898                        // If the new one takes less power, keep the new one.
899                        v.clear();
900                        v.insert(new_pdsi.clone());
901                        found_new = true;
902                    }
903                }
904                // This is the first for the performance target.
905                None => {
906                    // Let's add it and move on.
907                    let mut v: HashSet<PDSetInfo<'a>> = HashSet::new();
908                    v.insert(new_pdsi.clone());
909                    pdss_infos.insert(new_pdsi.performance, v.into());
910                    found_new = true;
911                }
912            }
913        }
914        found_new
915    }
916
917    /// Get a vector of (performance domain, performance state) to achieve
918    /// the given CPU utilization, @util.
919    fn gen_pds_set(&self, util: f32) -> Vec<PDS<'_>> {
920        let mut pds_set = vec![];
921        for (_, pd) in self.em.perf_doms.iter() {
922            let ps = pd.select_perf_state(util).unwrap();
923            let pds = PDS::new(pd, ps);
924            pds_set.push(pds);
925        }
926        self.expand_pds_set(&mut pds_set);
927        pds_set
928    }
929
930    /// Expand a PDS vector such that a performance domain with X CPUs
931    /// has N elements in the vector. This is purely for generating
932    /// combinations easy.
933    fn expand_pds_set(&self, pds_set: &mut Vec<PDS<'_>>) {
934        let mut xset = vec![];
935        // For a performance domain having nr_cpus, add nr_cpus-1 more
936        // PDS to make the PDS nr_cpus in the vector.
937        for pds in pds_set.iter() {
938            let nr_cpus = pds.pd.span.weight();
939            for _ in 1..nr_cpus {
940                xset.push(pds.clone());
941            }
942        }
943        pds_set.append(&mut xset);
944
945        // Sort the pds_set for easy comparision.
946        pds_set.sort();
947    }
948}
949
950impl<'a> PDS<'_> {
951    fn new(pd: &'a PerfDomain, ps: &'a PerfState) -> PDS<'a> {
952        PDS { pd, ps }
953    }
954}
955
956impl PartialEq for PDS<'_> {
957    fn eq(&self, other: &Self) -> bool {
958        self.pd == other.pd && self.ps == other.ps
959    }
960}
961
962impl<'a> PDCpu<'_> {
963    fn new(pd: &'a PerfDomain, cpu_vid: usize) -> PDCpu<'a> {
964        PDCpu { pd, cpu_vid }
965    }
966}
967
968impl PartialEq for PDCpu<'_> {
969    fn eq(&self, other: &Self) -> bool {
970        self.pd == other.pd && self.cpu_vid == other.cpu_vid
971    }
972}
973
974impl fmt::Display for PDS<'_> {
975    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
976        write!(
977            f,
978            "pd:id:{}/pd:weight:{}/ps:cap:{}/ps:power:{}",
979            self.pd.id,
980            self.pd.span.weight(),
981            self.ps.performance,
982            self.ps.power,
983        )?;
984        Ok(())
985    }
986}
987
988impl<'a> PDSetInfo<'_> {
989    fn new(pds_set: Vec<PDS<'a>>) -> PDSetInfo<'a> {
990        // Create a pd_id_set and calculate performance and power.
991        let mut performance = 0;
992        let mut power = 0;
993        let mut pd_id_set: BTreeSet<usize> = BTreeSet::new();
994
995        for pds in pds_set.iter() {
996            performance += pds.ps.performance;
997            power += pds.ps.power;
998            pd_id_set.insert(pds.pd.id);
999        }
1000
1001        // Create a pdcpu_set, so first gather the same PDS entires.
1002        let mut pds_map: BTreeMap<PDS<'a>, RefCell<Vec<PDS<'a>>>> = BTreeMap::new();
1003
1004        for pds in pds_set.iter() {
1005            let v = pds_map.get(&pds);
1006            match v {
1007                Some(v) => {
1008                    let mut v = v.borrow_mut();
1009                    v.push(pds.clone());
1010                }
1011                None => {
1012                    let mut v: Vec<PDS<'a>> = Vec::new();
1013                    v.push(pds.clone());
1014                    pds_map.insert(pds.clone(), v.into());
1015                }
1016            }
1017        }
1018        // Then assign cpu virtual ids to pdcpu_set.
1019        let mut pdcpu_set: BTreeSet<PDCpu<'a>> = BTreeSet::new();
1020        let pds_map = pds_map;
1021
1022        for (_, v) in pds_map.iter() {
1023            for (cpu_vid, pds) in v.borrow().iter().enumerate() {
1024                let pdcpu = PDCpu::new(pds.pd, cpu_vid);
1025                pdcpu_set.insert(pdcpu);
1026            }
1027        }
1028
1029        PDSetInfo {
1030            performance,
1031            power,
1032            pdcpu_set,
1033            pd_id_set,
1034        }
1035    }
1036
1037    /// Calculate the distance from @base to @self. We minimize the number of
1038    /// performance domains involved to reduce the leakage power consumption.
1039    /// We then maximize the overlap between the previous (i.e., base)
1040    /// performance domains and the new one for a smooth transition to the new
1041    /// cpuset with higher cache locality. Finally, we minimize the number of
1042    /// CPUs involved, thereby reducing the chance of contention for shared
1043    /// hardware resources (e.g., shared cache).
1044    fn dist(&self, base: Option<&PDSetInfo<'a>>) -> usize {
1045        let nr_pds = self.pd_id_set.len();
1046        let nr_pds_overlap = match base {
1047            Some(base) => self.pd_id_set.intersection(&base.pd_id_set).count(),
1048            None => 0,
1049        };
1050        let nr_cpus = self.pdcpu_set.len();
1051
1052        ((nr_pds - nr_pds_overlap) * PD_UNIT) +         // # non-overlapping PDs
1053        ((*NR_CPU_IDS - nr_cpus) * CPU_UNIT) +          // # of CPUs
1054        (*NR_CPU_IDS - self.pd_id_set.first().unwrap()) // PD ID as a tiebreaker
1055    }
1056}
1057
1058impl PartialEq for PDSetInfo<'_> {
1059    fn eq(&self, other: &Self) -> bool {
1060        self.performance == other.performance
1061            && self.power == other.power
1062            && self.pdcpu_set == other.pdcpu_set
1063    }
1064}
1065
1066impl Hash for PDSetInfo<'_> {
1067    fn hash<H: Hasher>(&self, state: &mut H) {
1068        // We don't need to hash performance, power, and pd_id_set
1069        // since they are a kind of cache for pds_set.
1070        self.pdcpu_set.hash(state);
1071    }
1072}
1073
1074impl PartialEq for PerfCpuOrder {
1075    fn eq(&self, other: &Self) -> bool {
1076        self.perf_cap == other.perf_cap
1077    }
1078}
1079
1080impl fmt::Display for PerfCpuOrder {
1081    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1082        write!(
1083            f,
1084            "capacity bound:  {} ({}%)\n",
1085            self.perf_cap,
1086            self.perf_util * 100.0
1087        )?;
1088        write!(f, "  primary CPUs:  {:?}\n", self.cpus_perf.borrow())?;
1089        write!(f, "  overflow CPUs: {:?}", self.cpus_ovflw.borrow())?;
1090        Ok(())
1091    }
1092}