scx_rustland_core/
alloc.rs

// Copyright (c) Andrea Righi <andrea.righi@linux.dev>

// Buddy allocator code imported from https://github.com/jjyr/buddy-alloc
// and distributed under the terms of the MIT license.
//
// MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

use seccomp::*;
use std::alloc::{GlobalAlloc, Layout};
use std::fs::File;
use std::io::{BufRead, BufReader, Write};
use std::sync::Mutex;

/// Buddy allocator
///
/// The following code is strongly based on <https://github.com/jjyr/buddy-alloc> and imported
/// directly here to make packaging easier.

const OOM_MSG: &str = "requires more memory space to initialize BuddyAlloc";
const LEAF_ALIGN_ERROR_MSG: &str = "leaf size must be aligned to 16 bytes";
/// Required alignment is 16 bytes, since Node takes 16 bytes on 64-bit machines.
pub const MIN_LEAF_SIZE_ALIGN: usize = 16;

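// Size in bytes of an order-k block: 2^k leaves of (1 << leaf2base) bytes each.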
const fn block_size_2base(k: usize, leaf2base: usize) -> usize {
    (1 << k) << leaf2base
}

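// Number of order-k blocks tracked by the bit arrays: the dummy top order
// (entries_size - 1) tracks a single block, and each lower order doubles the count.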
const fn nblock(k: usize, entries_size: usize) -> usize {
    1 << (entries_size - k - 1)
}

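// Round n up to the next multiple of (1 << sz2base).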
const fn roundup(n: usize, sz2base: usize) -> usize {
    (((n - 1) >> sz2base) + 1) << sz2base
}

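// Integer floor of log2(n); returns 0 for n <= 1.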
fn log2(mut n: usize) -> usize {
    let mut k = 0;
    while n > 1 {
        k += 1;
        n >>= 1;
    }
    k
}

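// Bit array helpers: bit i lives in byte (i >> 3) and is tested/set/cleared with mask (1 << (i % 8)).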
fn bit_isset(bit_array: *const u8, i: usize) -> bool {
    unsafe {
        let b = bit_array.add(i >> 3);
        let m = 1 << (i % 8);
        *b & m == m
    }
}

fn bit_set(bit_array: *mut u8, i: usize) {
    unsafe {
        let b = bit_array.add(i >> 3);
        let m = 1 << (i % 8);
        *b |= m;
    }
}

fn bit_clear(bit_array: *mut u8, i: usize) {
    debug_assert!(bit_isset(bit_array, i));
    unsafe {
        let b = bit_array.add(i >> 3);
        let m = 1 << (i % 8);
        *b &= !m;
    }
}

// Find the minimum order k such that a block of that order (leaf_size << k bytes) can hold n bytes.
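// For example, with leaf_size = 64: first_up_k(64, 64) == 0, first_up_k(65, 64) == 1
// (128-byte block) and first_up_k(200, 64) == 2 (256-byte block).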
pub fn first_up_k(n: usize, leaf_size: usize) -> usize {
    let mut k = 0;
    let mut size = leaf_size;
    while size < n {
        k += 1;
        size <<= 1;
    }
    k
}

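// Free-list node: a circular, doubly-linked list embedded directly in the free
// blocks themselves (which is why blocks must be at least 16 bytes).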
struct Node {
    next: *mut Node,
    prev: *mut Node,
}

impl Node {
    fn init(list: *mut Node) {
        unsafe {
            (*list).next = list;
            (*list).prev = list;
        }
    }

    fn remove(list: *mut Node) {
        unsafe {
            (*(*list).prev).next = (*list).next;
            (*(*list).next).prev = (*list).prev;
        }
    }

    fn pop(list: *mut Node) -> *mut Node {
        debug_assert!(!Self::is_empty(list));
        let n_list = unsafe { (*list).next };
        Self::remove(n_list);
        n_list
    }

    fn push(list: *mut Node, p: *mut u8) {
        let p = p.cast::<Node>();
        unsafe {
            let n_list: Node = Node {
                prev: list,
                next: (*list).next,
            };
            // the pointer is aligned to 16 bytes (MIN_LEAF_SIZE_ALIGN), so it's safe to use write
            p.write(n_list);
            (*(*list).next).prev = p;
            (*list).next = p;
        }
    }

    fn is_empty(list: *const Node) -> bool {
        unsafe { (*list).next as *const Node == list }
    }
}

struct Entry {
    free: *mut Node,
    /// Bit array keeping track of allocated blocks
    alloc: *mut u8,
    /// Bit array keeping track of split blocks
    split: *mut u8,
}

impl Default for Entry {
    fn default() -> Self {
        Entry {
            free: core::ptr::null_mut(),
            alloc: core::ptr::null_mut(),
            split: core::ptr::null_mut(),
        }
    }
}

#[derive(Clone, Copy)]
pub struct BuddyAllocParam {
    /// Base addr: the start address
    base_addr: *const u8,
    /// Len: available bytes from the start address
    len: usize,
    /// Leaf size: the min size to allocate
    leaf_size: usize,
    /// Zero filled: in many cases the provided memory is already zero-filled,
    /// in which case we can skip zeroing it again.
    zero_filled: bool,
}

impl BuddyAllocParam {
    /// Base addr: the start address
    /// Len: available bytes from the start address
    /// Leaf size: the min size to allocate
    pub const fn new(base_addr: *const u8, len: usize, leaf_size: usize) -> Self {
        BuddyAllocParam {
            base_addr,
            len,
            leaf_size,
            zero_filled: false,
        }
    }
}

pub struct BuddyAlloc {
    /// memory start addr
    base_addr: usize,
    /// memory end addr
    end_addr: usize,
    /// unavailable memory at end_addr
    unavailable: usize,
    entries: *mut Entry,
    entries_size: usize,
    /// min size of a block, represented as 1 << leaf2base
    leaf2base: usize,
}

impl BuddyAlloc {
    /// # Safety
    ///
    /// The memory range `base_addr..(base_addr + len)` must be allocated before use,
    /// and nothing else may write to that range, to avoid undefined behavior.
    /// This function panics if the memory space is not large enough to initialize the BuddyAlloc.
    pub unsafe fn new(param: BuddyAllocParam) -> Self {
        unsafe {
            let BuddyAllocParam {
                base_addr,
                len,
                leaf_size,
                zero_filled,
            } = param;
            let mut base_addr = base_addr as usize;
            let end_addr = base_addr + len;
            assert!(
                leaf_size % MIN_LEAF_SIZE_ALIGN == 0 && leaf_size != 0,
                "{}",
                LEAF_ALIGN_ERROR_MSG
            );
            let leaf2base = log2(leaf_size);
            base_addr = roundup(base_addr, leaf2base);
            // We use the (k + 1)-th entry's split flag to test the existence of the k-th
            // entry's blocks; to honor this convention we add a dummy (entries_size - 1)-th
            // entry, which is why entries_size gets an extra 2.
            let entries_size = log2((end_addr - base_addr) >> leaf2base) + 2;
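            // For example, with the 64 MiB HEAP arena defined at the bottom of this file
            // and 64-byte leaves: (64 MiB >> 6) = 2^20 leaves, so entries_size = 20 + 2 = 22.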

            // alloc buddy allocator memory
            let used_bytes = core::mem::size_of::<Entry>() * entries_size;
            debug_assert!(end_addr >= base_addr + used_bytes, "{}", OOM_MSG);
            let entries = base_addr as *mut Entry;
            base_addr += used_bytes;

            let buddy_list_size = core::mem::size_of::<Node>();
            // init the free list of each entry
            for k in 0..entries_size {
                // reserve a Node-sized free list head per order
                debug_assert!(end_addr >= base_addr + buddy_list_size, "{}", OOM_MSG);
                let entry = entries.add(k).as_mut().expect("entry");
                entry.free = base_addr as *mut Node;
                if !zero_filled {
                    // cast to *mut u8: write_bytes() counts in units of T, and buddy_list_size is in bytes
                    core::ptr::write_bytes(entry.free as *mut u8, 0, buddy_list_size);
                }
                Node::init(entry.free);
                base_addr += buddy_list_size;
            }

            // init alloc
            for k in 0..entries_size {
                // use one bit per memory block
                // use shift instead of `/`: 8 == 1 << 3
                let used_bytes = roundup(nblock(k, entries_size), 3) >> 3;
                debug_assert!(end_addr >= base_addr + used_bytes, "{}", OOM_MSG);
                let entry = entries.add(k).as_mut().expect("entry");
                entry.alloc = base_addr as *mut u8;
                // clear the bit array: all blocks start out as free
                if !zero_filled {
                    core::ptr::write_bytes(entry.alloc, 0, used_bytes);
                }
                base_addr += used_bytes;
            }

            // init split
            for k in 1..entries_size {
                // use one bit per memory block
                // use shift instead of `/`: 8 == 1 << 3
                let used_bytes = roundup(nblock(k, entries_size), 3) >> 3;
                debug_assert!(end_addr >= base_addr + used_bytes, "{}", OOM_MSG);
                let entry = entries.add(k).as_mut().expect("entry");
                entry.split = base_addr as *mut u8;
                if !zero_filled {
                    core::ptr::write_bytes(entry.split, 0, used_bytes);
                }
                base_addr += used_bytes;
            }

            // align base_addr to leaf size
            base_addr = roundup(base_addr, leaf2base);
            assert!(end_addr >= base_addr, "{}", OOM_MSG);
            debug_assert_eq!(
                (base_addr >> leaf2base) << leaf2base,
                base_addr,
                "misalignment"
            );

            let mut allocator = BuddyAlloc {
                base_addr,
                end_addr,
                entries,
                entries_size,
                leaf2base,
                unavailable: 0,
            };
            allocator.init_free_list();
            allocator
        }
    }

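    /// Populate the free lists: greedily carve the largest blocks that still fit out of the
    /// remaining arena, and mark the leftover tail at the end as unavailable.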
    fn init_free_list(&mut self) {
        let mut base_addr = self.base_addr;
        let end_addr = self.end_addr;
        let entries_size = self.entries_size;

        // try to allocate blocks, from the largest order down
        for k in (0..(entries_size - 1)).rev() {
            let block_size = block_size_2base(k, self.leaf2base);
            let entry = self.entry(k);
            let parent_entry = self.entry(k + 1);

            // push all the blocks of this order that still fit onto the free list
            while base_addr + block_size <= end_addr {
                debug_assert!(!bit_isset(
                    entry.alloc,
                    self.block_index(k, base_addr as *const u8)
                ));
                Node::push(entry.free, base_addr as *mut u8);
                // mark parent's split and alloc
                let block_index = self.block_index(k, base_addr as *const u8);
                if block_index & 1 == 0 {
                    let parent_index = self.block_index(k + 1, base_addr as *const u8);
                    bit_set(parent_entry.alloc, parent_index);
                    bit_set(parent_entry.split, parent_index);
                }
                base_addr += block_size;
            }

            // mark unavailable blocks as allocated
            let n = nblock(k, entries_size);
            let unavailable_block_index = self.block_index(k, base_addr as *const u8);
            debug_assert!(unavailable_block_index < n);
            bit_set(entry.alloc, unavailable_block_index);
        }

        self.unavailable = end_addr - base_addr;
    }

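    /// Allocate at least `nbytes` bytes: find the smallest order with a free block that is
    /// large enough, then split it down to the requested order, pushing the unused buddy
    /// halves back onto the lower free lists. Returns NULL if no block is available.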
    pub fn malloc(&mut self, nbytes: usize) -> *mut u8 {
        let fk = first_up_k(nbytes, 1 << self.leaf2base);
        let mut k = match (fk..self.entries_size).find(|&k| !Node::is_empty(self.entry(k).free)) {
            Some(k) => k,
            None => return core::ptr::null_mut(),
        };
        let p: *mut u8 = Node::pop(self.entry(k).free) as *mut u8;
        bit_set(self.entry(k).alloc, self.block_index(k, p));
        while k > fk {
            let q: *mut u8 = (p as usize + block_size_2base(k - 1, self.leaf2base)) as *mut u8;
            bit_set(self.entry(k).split, self.block_index(k, p));
            let parent_entry = self.entry(k - 1);
            bit_set(parent_entry.alloc, self.block_index(k - 1, p));
            debug_assert!(!bit_isset(parent_entry.alloc, self.block_index(k - 1, q)));
            Node::push(parent_entry.free, q);
            k -= 1;
        }
        debug_assert_eq!(
            ((p as usize) >> self.leaf2base) << self.leaf2base,
            p as usize,
            "misalignment"
        );
        p
    }

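    /// Free a block previously returned by malloc(): clear its alloc bit and keep merging it
    /// with its buddy, moving up one order at a time, for as long as the buddy is also free.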
    pub fn free(&mut self, mut p: *mut u8) {
        let mut k = self.find_k_for_p(p);
        while k < (self.entries_size - 1) {
            let block_index = self.block_index(k, p);
            let entry = self.entry(k);
            bit_clear(entry.alloc, block_index);
            let is_head = block_index & 1 == 0;
            let buddy = if is_head {
                block_index + 1
            } else {
                block_index - 1
            };
            if bit_isset(entry.alloc, buddy) {
                break;
            }
            // merge with the buddy since it's free:
            // 1. clear the split bit of k + 1
            // 2. set p to the address of the merged block
            // 3. repeat for k = k + 1 until we reach the topmost order
            // 4. push p back to the k-th entry's free list
            let q = self.block_addr(k, buddy);
            Node::remove(q as *mut Node);
            if !is_head {
                p = q as *mut u8;
            }
            bit_clear(self.entry(k + 1).split, self.block_index(k + 1, p));
            k += 1;
        }
        debug_assert!(!bit_isset(self.entry(k).alloc, self.block_index(k, p)));
        Node::push(self.entry(k).free, p);
    }

    fn entry(&self, i: usize) -> &Entry {
        debug_assert!(i < self.entries_size, "index out of range");
        unsafe { self.entries.add(i).as_ref().expect("entry") }
    }

    /// Find the order k of the block that contains p.
    fn find_k_for_p(&self, p: *const u8) -> usize {
        for k in 0..(self.entries_size - 1) {
            if bit_isset(self.entry(k + 1).split, self.block_index(k + 1, p)) {
                debug_assert!(bit_isset(self.entry(k).alloc, self.block_index(k, p)));
                return k;
            }
        }
        0
    }

    /// Block index of p at order k.
    fn block_index(&self, k: usize, p: *const u8) -> usize {
        if (p as usize) < self.base_addr {
            // TODO handle this outside
            panic!("out of memory");
        }
        let n = p as usize - self.base_addr;
        // equal to: n / block_size_2base(k, self.leaf2base)
        let index = (n >> k) >> self.leaf2base;
        debug_assert!(index < nblock(k, self.entries_size));
        index
    }

    /// Block address of index i at order k.
    fn block_addr(&self, k: usize, i: usize) -> usize {
        // equal to: i * block_size_2base(k, self.leaf2base)
        let n = (i << k) << self.leaf2base;
        self.base_addr + n
    }
}

// Main allocator type, wrapping a lazily initialized BuddyAlloc.
pub struct UserAllocator {
    buddy_alloc_param: BuddyAllocParam,
    inner_buddy_alloc: Mutex<Option<BuddyAlloc>>,
}

impl UserAllocator {
    pub const fn new(buddy_alloc_param: BuddyAllocParam) -> Self {
        UserAllocator {
            inner_buddy_alloc: Mutex::new(None),
            buddy_alloc_param,
        }
    }

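    // Run the given closure on the inner BuddyAlloc, lazily initializing it on first use
    // while holding the mutex, so concurrent allocations are serialized.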
    unsafe fn fetch_buddy_alloc<R, F: FnOnce(&mut BuddyAlloc) -> R>(&self, f: F) -> R {
        unsafe {
            let mut inner = self.inner_buddy_alloc.lock().unwrap();
            let alloc = inner.get_or_insert_with(|| BuddyAlloc::new(self.buddy_alloc_param));
            f(alloc)
        }
    }

    // Enable a seccomp filter that makes mmap() fail with EPERM.
    #[allow(static_mut_refs)]
    pub fn disable_mmap(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut ctx = seccomp::Context::default(Action::Allow)?;
        let syscall_nr = libc::SYS_mmap as usize;
        let cmp = Compare::arg(0)
            .using(Op::MaskedEq)
            .with(0)
            .build()
            .ok_or("Failed to build seccomp filter")?;

        let rule = Rule::new(syscall_nr, cmp, Action::Errno(libc::EPERM));
        ctx.add_rule(rule)?;
        ctx.load()?;

        Ok(())
    }

    #[allow(static_mut_refs)]
    pub fn lock_memory(&self) {
        unsafe {
            VM.save().ok();

            // Call setrlimit to set the locked-in-memory limit to unlimited.
            let new_rlimit = libc::rlimit {
                rlim_cur: libc::RLIM_INFINITY,
                rlim_max: libc::RLIM_INFINITY,
            };
            let res = libc::setrlimit(libc::RLIMIT_MEMLOCK, &new_rlimit);
            if res != 0 {
                panic!("setrlimit failed: {}", std::io::Error::last_os_error());
            }

            // Lock all memory to prevent it from being paged out.
            let res = libc::mlockall(libc::MCL_CURRENT | libc::MCL_FUTURE);
            if res != 0 {
                panic!("mlockall failed: {}", std::io::Error::last_os_error());
            }

            // Hint the kernel to use huge pages for the memory arena.
            let ptr = &mut HEAP.0 as *mut u8 as *mut libc::c_void;
            libc::madvise(ptr, HEAP_SIZE, libc::MADV_HUGEPAGE);
        }
    }

    #[allow(static_mut_refs)]
    pub fn unlock_memory(&self) {
        unsafe {
            VM.restore().ok();
        };
    }
}

// Override global allocator methods.
unsafe impl GlobalAlloc for UserAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        unsafe {
            let bytes = layout.size();
            self.fetch_buddy_alloc(|alloc| alloc.malloc(bytes))
        }
    }

    unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) {
        unsafe {
            self.fetch_buddy_alloc(|alloc| alloc.free(ptr));
        }
    }
}

unsafe impl Sync for UserAllocator {}

// Buddy allocator parameters.
const HEAP_SIZE: usize = 64 * 1024 * 1024; // 64M
const LEAF_SIZE: usize = 64;

#[repr(align(4096))]
struct AlignedHeap<const N: usize>([u8; N]);

// Statically pre-allocated memory arena.
static mut HEAP: AlignedHeap<HEAP_SIZE> = AlignedHeap([0u8; HEAP_SIZE]);

// Override the default memory allocator.
//
// To prevent potential deadlock conditions under heavy loads, any scheduler that delegates
// scheduling decisions to user-space should avoid triggering page faults.
//
// To address this issue, the global allocator is replaced with a custom one (UserAllocator),
// designed to operate on a pre-allocated buffer. This, coupled with the memory locking achieved
// through mlockall(), prevents page faults from occurring during the execution of the user-space
// scheduler.
#[allow(static_mut_refs)]
#[cfg_attr(not(test), global_allocator)]
pub static ALLOCATOR: UserAllocator =
    unsafe { UserAllocator::new(BuddyAllocParam::new(HEAP.0.as_ptr(), HEAP_SIZE, LEAF_SIZE)) };

// State of special sysctl VM settings.
//
// This is used to save the previous state of some procfs settings that must be changed by the
// user-space scheduler.
struct VmSettings {
    // We cannot allow page faults in the user-space scheduler, otherwise we may hit deadlock
    // conditions: a kthread may need to run to resolve the page fault, but the user-space
    // scheduler is waiting on the page fault to be resolved => deadlock.
    //
    // To prevent this from happening, automatically enforce vm.compact_unevictable_allowed=0
    // when the scheduler is running, to disable compaction of unevictable memory pages and
    // make sure that the scheduler never faults.
    //
    // The original value will be restored when the user-space scheduler exits.
    compact_unevictable_allowed: i32,
}

impl VmSettings {
    // Return the content of a procfs file as i32.
    fn read_procfs(&self, file_path: &str) -> Result<i32, String> {
        // Attempt to open the file
        let file = File::open(file_path)
            .map_err(|err| format!("Failed to open {}: {}", file_path, err))?;

        let reader = BufReader::new(file);

        let line = reader
            .lines()
            .next()
            .ok_or_else(|| format!("File is empty: {}", file_path))?
            .map_err(|err| format!("Failed to read line from {}: {}", file_path, err))?;

        line.trim()
            .parse::<i32>()
            .map_err(|err| format!("Failed to parse {}: {}", file_path, err))
    }

    // Write an i32 to a file in procfs.
    fn write_procfs(&self, file_path: &str, value: i32) -> Result<(), String> {
        // Attempt to create or open the file
        let mut file = File::create(file_path)
            .map_err(|err| format!("Failed to open {}: {}", file_path, err))?;

        // Attempt to write the value to the file
        write!(file, "{}", value)
            .map_err(|err| format!("Failed to write to {}: {}", file_path, err))
    }

    // Save the current sysctl VM settings and apply the values required by the scheduler.
    fn save(&self) -> Result<(), String> {
        let compact_unevictable_allowed = "/proc/sys/vm/compact_unevictable_allowed";
        let value = self.read_procfs(compact_unevictable_allowed)?;
        unsafe {
            VM.compact_unevictable_allowed = value;
        };
        self.write_procfs(compact_unevictable_allowed, 0)?;

        Ok(())
    }

    // Restore the previously saved sysctl VM settings.
    fn restore(&self) -> Result<(), String> {
        let compact_unevictable_allowed = "/proc/sys/vm/compact_unevictable_allowed";
        let value = unsafe { VM.compact_unevictable_allowed };
        self.write_procfs(compact_unevictable_allowed, value)?;

        Ok(())
    }
}

// Special sysctl VM settings.
static mut VM: VmSettings = VmSettings {
    compact_unevictable_allowed: 0,
};
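
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal sanity check, added as an illustrative sketch (not part of the imported
    // buddy-alloc code): build a BuddyAlloc over a small heap-allocated arena and verify
    // that malloc()/free() round-trip. The arena and allocation sizes are arbitrary
    // assumptions chosen for the test.
    #[test]
    fn buddy_alloc_malloc_free_roundtrip() {
        const ARENA_SIZE: usize = 64 * 1024;
        let mut arena = vec![0u8; ARENA_SIZE];
        let mut alloc = unsafe {
            BuddyAlloc::new(BuddyAllocParam::new(arena.as_mut_ptr(), ARENA_SIZE, LEAF_SIZE))
        };

        // Allocate two blocks and make sure they are valid and distinct.
        let a = alloc.malloc(128);
        let b = alloc.malloc(4096);
        assert!(!a.is_null());
        assert!(!b.is_null());
        assert_ne!(a, b);

        // Free both blocks and check that the memory can be reused.
        alloc.free(a);
        alloc.free(b);
        let c = alloc.malloc(128);
        assert!(!c.is_null());
        alloc.free(c);
    }
}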