use seccomp::*;
use std::alloc::{GlobalAlloc, Layout};
use std::fs::File;
use std::io::{BufRead, BufReader, Write};
use std::sync::Mutex;

const OOM_MSG: &str = "requires more memory space to initialize BuddyAlloc";
const LEAF_ALIGN_ERROR_MSG: &str = "leaf size must be aligned to 16 bytes";
pub const MIN_LEAF_SIZE_ALIGN: usize = 16;

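/// Byte size of a block at level `k`: `1 << k` leaves, with the leaf size
/// expressed as a power of two through `leaf2base`.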
const fn block_size_2base(k: usize, leaf2base: usize) -> usize {
    (1 << k) << leaf2base
}

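/// Number of blocks that exist at level `k` in a tree with `entries_size`
/// levels (a single block at the topmost level).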
const fn nblock(k: usize, entries_size: usize) -> usize {
    1 << (entries_size - k - 1)
}

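/// Round `n` up to the next multiple of `1 << sz2base`, e.g. roundup(100, 4) == 112.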
const fn roundup(n: usize, sz2base: usize) -> usize {
    (((n - 1) >> sz2base) + 1) << sz2base
}

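/// Integer (floor) log2, e.g. log2(64) == 6 and log2(1) == 0.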
fn log2(mut n: usize) -> usize {
    let mut k = 0;
    while n > 1 {
        k += 1;
        n >>= 1;
    }
    k
}

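// Bitmap helpers: each level stores per-block state in a packed bit array,
// where bit `i` lives in byte `i / 8` under mask `1 << (i % 8)`.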
fn bit_isset(bit_array: *const u8, i: usize) -> bool {
    unsafe {
        let b = bit_array.add(i >> 3);
        let m = 1 << (i % 8);
        *b & m == m
    }
}

fn bit_set(bit_array: *mut u8, i: usize) {
    unsafe {
        let b = bit_array.add(i >> 3);
        let m = 1 << (i % 8);
        *b |= m;
    }
}

fn bit_clear(bit_array: *mut u8, i: usize) {
    debug_assert!(bit_isset(bit_array, i));
    unsafe {
        let b = bit_array.add(i >> 3);
        let m = 1 << (i % 8);
        *b &= !m;
    }
}

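/// Smallest level `k` whose block size (`leaf_size << k`) can hold `n` bytes;
/// e.g. with a 64-byte leaf, a 100-byte request maps to k == 1.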
pub fn first_up_k(n: usize, leaf_size: usize) -> usize {
    let mut k = 0;
    let mut size = leaf_size;
    while size < n {
        k += 1;
        size <<= 1;
    }
    k
}

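/// Intrusive, circular doubly-linked list node embedded directly in free
/// blocks; an empty list is a sentinel whose `next` and `prev` point to itself.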
struct Node {
    next: *mut Node,
    prev: *mut Node,
}

impl Node {
    fn init(list: *mut Node) {
        unsafe {
            (*list).next = list;
            (*list).prev = list;
        }
    }

    fn remove(list: *mut Node) {
        unsafe {
            (*(*list).prev).next = (*list).next;
            (*(*list).next).prev = (*list).prev;
        }
    }

    fn pop(list: *mut Node) -> *mut Node {
        debug_assert!(!Self::is_empty(list));
        let n_list = unsafe { (*list).next };
        Self::remove(n_list);
        n_list
    }

    fn push(list: *mut Node, p: *mut u8) {
        let p = p.cast::<Node>();
        unsafe {
            let n_list: Node = Node {
                prev: list,
                next: (*list).next,
            };
            p.write(n_list);
            (*(*list).next).prev = p;
            (*list).next = p;
        }
    }

    fn is_empty(list: *const Node) -> bool {
        unsafe { (*list).next as *const Node == list }
    }
}

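/// Per-level metadata: the head of the free list plus two bitmaps tracking
/// which blocks are allocated and which blocks have been split into buddies.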
struct Entry {
    free: *mut Node,
    alloc: *mut u8,
    split: *mut u8,
}

impl Default for Entry {
    fn default() -> Self {
        Entry {
            free: core::ptr::null_mut(),
            alloc: core::ptr::null_mut(),
            split: core::ptr::null_mut(),
        }
    }
}

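/// Parameters used to build a `BuddyAlloc`: the backing memory region and its
/// length, the leaf (minimum block) size, and whether the region is already
/// known to be zero-filled so the metadata does not need explicit clearing.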
#[derive(Clone, Copy)]
pub struct BuddyAllocParam {
    base_addr: *const u8,
    len: usize,
    leaf_size: usize,
    zero_filled: bool,
}

impl BuddyAllocParam {
    pub const fn new(base_addr: *const u8, len: usize, leaf_size: usize) -> Self {
        BuddyAllocParam {
            base_addr,
            len,
            leaf_size,
            zero_filled: false,
        }
    }
}

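/// Buddy allocator over a fixed memory region: `entries[k]` describes blocks
/// of size `leaf_size << k`, and all of the metadata is carved out of the
/// beginning of the region itself.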
pub struct BuddyAlloc {
    base_addr: usize,
    end_addr: usize,
    unavailable: usize,
    entries: *mut Entry,
    entries_size: usize,
    leaf2base: usize,
}

impl BuddyAlloc {
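    /// Build the allocator inside the given region: the `Entry` table, the
    /// per-level free-list sentinels and the alloc/split bitmaps are placed at
    /// the start of the region, and whatever remains becomes the usable pool.
    ///
    /// # Safety
    ///
    /// The caller must provide a region that is valid for reads and writes
    /// for `len` bytes and is not used for anything else.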
    pub unsafe fn new(param: BuddyAllocParam) -> Self {
        unsafe {
            let BuddyAllocParam {
                base_addr,
                len,
                leaf_size,
                zero_filled,
            } = param;
            let mut base_addr = base_addr as usize;
            let end_addr = base_addr + len;
            assert!(
                leaf_size % MIN_LEAF_SIZE_ALIGN == 0 && leaf_size != 0,
                "{}",
                LEAF_ALIGN_ERROR_MSG
            );
            let leaf2base = log2(leaf_size);
            base_addr = roundup(base_addr, leaf2base);
            let entries_size = log2((end_addr - base_addr) >> leaf2base) + 2;

            let used_bytes = core::mem::size_of::<Entry>() * entries_size;
            debug_assert!(end_addr >= base_addr + used_bytes, "{}", OOM_MSG);
            let entries = base_addr as *mut Entry;
            base_addr += used_bytes;

            let buddy_list_size = core::mem::size_of::<Node>();
            for k in 0..entries_size {
                debug_assert!(end_addr >= base_addr + buddy_list_size, "{}", OOM_MSG);
                let entry = entries.add(k).as_mut().expect("entry");
                entry.free = base_addr as *mut Node;
                if !zero_filled {
                    core::ptr::write_bytes(entry.free, 0, buddy_list_size);
                }
                Node::init(entry.free);
                base_addr += buddy_list_size;
            }

            for k in 0..entries_size {
                let used_bytes = roundup(nblock(k, entries_size), 3) >> 3;
                debug_assert!(end_addr >= base_addr + used_bytes, "{}", OOM_MSG);
                let entry = entries.add(k).as_mut().expect("entry");
                entry.alloc = base_addr as *mut u8;
                if !zero_filled {
                    core::ptr::write_bytes(entry.alloc, 0, used_bytes);
                }
                base_addr += used_bytes;
            }

            for k in 1..entries_size {
                let used_bytes = roundup(nblock(k, entries_size), 3) >> 3;
                debug_assert!(end_addr >= base_addr + used_bytes, "{}", OOM_MSG);
                let entry = entries.add(k).as_mut().expect("entry");
                entry.split = base_addr as *mut u8;
                if !zero_filled {
                    core::ptr::write_bytes(entry.split, 0, used_bytes);
                }
                base_addr += used_bytes;
            }

            base_addr = roundup(base_addr, leaf2base);
            assert!(end_addr >= base_addr, "{}", OOM_MSG);
            debug_assert_eq!(
                (base_addr >> leaf2base) << leaf2base,
                base_addr,
                "misalignment"
            );

            let mut allocator = BuddyAlloc {
                base_addr,
                end_addr,
                entries,
                entries_size,
                leaf2base,
                unavailable: 0,
            };
            allocator.init_free_list();
            allocator
        }
    }

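    /// Populate the free lists from the largest level down: every block that
    /// fits entirely inside the pool goes onto its level's free list (parents
    /// of head blocks are flagged as split), the first block that does not fit
    /// is marked allocated so it is never handed out, and the leftover tail is
    /// accounted for in `unavailable`.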
    fn init_free_list(&mut self) {
        let mut base_addr = self.base_addr;
        let end_addr = self.end_addr;
        let entries_size = self.entries_size;

        for k in (0..(entries_size - 1)).rev() {
            let block_size = block_size_2base(k, self.leaf2base);
            let entry = self.entry(k);
            let parent_entry = self.entry(k + 1);

            while base_addr + block_size <= end_addr {
                debug_assert!(!bit_isset(
                    entry.alloc,
                    self.block_index(k, base_addr as *const u8)
                ));
                Node::push(entry.free, base_addr as *mut u8);
                let block_index = self.block_index(k, base_addr as *const u8);
                if block_index & 1 == 0 {
                    let parent_index = self.block_index(k + 1, base_addr as *const u8);
                    bit_set(parent_entry.alloc, parent_index);
                    bit_set(parent_entry.split, parent_index);
                }
                base_addr += block_size;
            }

            let n = nblock(k, entries_size);
            let unavailable_block_index = self.block_index(k, base_addr as *const u8);
            debug_assert!(unavailable_block_index < n);
            bit_set(entry.alloc, unavailable_block_index);
        }

        self.unavailable = end_addr - base_addr;
    }

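    /// Allocate `nbytes`: find the smallest level that can hold the request,
    /// pop a block from the first level with a non-empty free list, and split
    /// it back down to the target level, pushing each unused buddy half onto
    /// the free list of the level below. Returns null if nothing fits.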
    pub fn malloc(&mut self, nbytes: usize) -> *mut u8 {
        let fk = first_up_k(nbytes, 1 << self.leaf2base);
        let mut k = match (fk..self.entries_size).find(|&k| !Node::is_empty(self.entry(k).free)) {
            Some(k) => k,
            None => return core::ptr::null_mut(),
        };
        let p: *mut u8 = Node::pop(self.entry(k).free) as *mut u8;
        bit_set(self.entry(k).alloc, self.block_index(k, p));
        while k > fk {
            let q: *mut u8 = (p as usize + block_size_2base(k - 1, self.leaf2base)) as *mut u8;
            bit_set(self.entry(k).split, self.block_index(k, p));
            let parent_entry = self.entry(k - 1);
            bit_set(parent_entry.alloc, self.block_index(k - 1, p));
            debug_assert!(!bit_isset(parent_entry.alloc, self.block_index(k - 1, q)));
            Node::push(parent_entry.free, q);
            k -= 1;
        }
        debug_assert_eq!(
            ((p as usize) >> self.leaf2base) << self.leaf2base,
            p as usize,
            "misalignment"
        );
        p
    }

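    /// Free a block: clear its allocated bit and, while its buddy is also
    /// free, merge the pair into the parent level; the resulting block is then
    /// pushed onto the free list of the level where merging stopped.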
    pub fn free(&mut self, mut p: *mut u8) {
        let mut k = self.find_k_for_p(p);
        while k < (self.entries_size - 1) {
            let block_index = self.block_index(k, p);
            let entry = self.entry(k);
            bit_clear(entry.alloc, block_index);
            let is_head = block_index & 1 == 0;
            let buddy = if is_head {
                block_index + 1
            } else {
                block_index - 1
            };
            if bit_isset(entry.alloc, buddy) {
                break;
            }
            let q = self.block_addr(k, buddy);
            Node::remove(q as *mut Node);
            if !is_head {
                p = q as *mut u8;
            }
            bit_clear(self.entry(k + 1).split, self.block_index(k + 1, p));
            k += 1;
        }
        debug_assert!(!bit_isset(self.entry(k).alloc, self.block_index(k, p)));
        Node::push(self.entry(k).free, p);
    }

    fn entry(&self, i: usize) -> &Entry {
        debug_assert!(i < self.entries_size, "index out of range");
        unsafe { self.entries.add(i).as_ref().expect("entry") }
    }

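    /// Recover the level a pointer was allocated at by walking up from the
    /// leaf level until the enclosing parent block is marked as split;
    /// defaults to level 0.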
    fn find_k_for_p(&self, p: *const u8) -> usize {
        for k in 0..(self.entries_size - 1) {
            if bit_isset(self.entry(k + 1).split, self.block_index(k + 1, p)) {
                debug_assert!(bit_isset(self.entry(k).alloc, self.block_index(k, p)));
                return k;
            }
        }
        0
    }

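    /// Index of the block containing `p` at level `k`, relative to the start
    /// of the pool.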
    fn block_index(&self, k: usize, p: *const u8) -> usize {
        if (p as usize) < self.base_addr {
            panic!("pointer out of range");
        }
        let n = p as usize - self.base_addr;
        let index = (n >> k) >> self.leaf2base;
        debug_assert!(index < nblock(k, self.entries_size));
        index
    }

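    /// Address of block `i` at level `k`; the inverse of `block_index`.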
    fn block_addr(&self, k: usize, i: usize) -> usize {
        let n = (i << k) << self.leaf2base;
        self.base_addr + n
    }
}

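/// Global allocator backed by the buddy allocator over a static, pre-allocated
/// heap; the inner `BuddyAlloc` is built lazily on first use behind a `Mutex`.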
pub struct UserAllocator {
    buddy_alloc_param: BuddyAllocParam,
    inner_buddy_alloc: Mutex<Option<BuddyAlloc>>,
}

impl UserAllocator {
    pub const fn new(buddy_alloc_param: BuddyAllocParam) -> Self {
        UserAllocator {
            inner_buddy_alloc: Mutex::new(None),
            buddy_alloc_param,
        }
    }

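    /// Run `f` against the lazily-initialized buddy allocator; the first call
    /// builds the allocator over the configured region while holding the lock.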
    unsafe fn fetch_buddy_alloc<R, F: FnOnce(&mut BuddyAlloc) -> R>(&self, f: F) -> R {
        unsafe {
            let mut inner = self.inner_buddy_alloc.lock().unwrap();
            let alloc = inner.get_or_insert_with(|| BuddyAlloc::new(self.buddy_alloc_param));
            f(alloc)
        }
    }

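    /// Install a seccomp filter that makes mmap() fail with EPERM; presumably
    /// this keeps the process from mapping new memory at runtime so that every
    /// allocation is served from the pre-allocated heap.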
    #[allow(static_mut_refs)]
    pub fn disable_mmap(&self) -> Result<(), Box<dyn std::error::Error>> {
        let mut ctx = seccomp::Context::default(Action::Allow)?;
        let syscall_nr = libc::SYS_mmap as usize;
        let cmp = Compare::arg(0)
            .using(Op::MaskedEq)
            .with(0)
            .build()
            .ok_or("Failed to build seccomp filter")?;

        let rule = Rule::new(syscall_nr, cmp, Action::Errno(libc::EPERM));
        ctx.add_rule(rule)?;
        ctx.load()?;

        Ok(())
    }

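    /// Lock the process memory: save and adjust the relevant procfs VM
    /// settings, raise RLIMIT_MEMLOCK to unlimited, pin all current and future
    /// pages with mlockall(), and hint the kernel to back the static heap with
    /// transparent huge pages.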
    #[allow(static_mut_refs)]
    pub fn lock_memory(&self) {
        unsafe {
            VM.save().ok();

            let new_rlimit = libc::rlimit {
                rlim_cur: libc::RLIM_INFINITY,
                rlim_max: libc::RLIM_INFINITY,
            };
            let res = libc::setrlimit(libc::RLIMIT_MEMLOCK, &new_rlimit);
            if res != 0 {
                panic!("setrlimit failed with error code: {}", res);
            }

            let res = libc::mlockall(libc::MCL_CURRENT | libc::MCL_FUTURE);
            if res != 0 {
                panic!("mlockall failed with error code: {}", res);
            }

            let ptr = &mut HEAP.0 as *mut u8 as *mut libc::c_void;
            libc::madvise(ptr, HEAP_SIZE, libc::MADV_HUGEPAGE);
        }
    }

    #[allow(static_mut_refs)]
    pub fn unlock_memory(&self) {
        unsafe {
            VM.restore().ok();
        };
    }
}

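// Route the global allocator through the buddy pool. Note that only
// layout.size() is honored here; alignment requests larger than the pool's
// leaf size are not explicitly handled.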
unsafe impl GlobalAlloc for UserAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        unsafe {
            let bytes = layout.size();
            self.fetch_buddy_alloc(|alloc| alloc.malloc(bytes))
        }
    }

    unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) {
        unsafe {
            self.fetch_buddy_alloc(|alloc| alloc.free(ptr));
        }
    }
}

unsafe impl Sync for UserAllocator {}

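// Statically reserved, 4 KiB-aligned heap handed to the buddy allocator:
// 64 MiB total with a 64-byte minimum allocation (leaf) size.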
const HEAP_SIZE: usize = 64 * 1024 * 1024;
const LEAF_SIZE: usize = 64;

#[repr(align(4096))]
struct AlignedHeap<const N: usize>([u8; N]);

static mut HEAP: AlignedHeap<HEAP_SIZE> = AlignedHeap([0u8; HEAP_SIZE]);

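// Register the buddy-backed UserAllocator as the global allocator, except in
// tests, where the default system allocator is kept.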
#[allow(static_mut_refs)]
#[cfg_attr(not(test), global_allocator)]
pub static ALLOCATOR: UserAllocator =
    unsafe { UserAllocator::new(BuddyAllocParam::new(HEAP.0.as_ptr(), HEAP_SIZE, LEAF_SIZE)) };

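/// Snapshot of the procfs VM settings touched by lock_memory() and
/// unlock_memory(), so the original values can be restored later.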
struct VmSettings {
    compact_unevictable_allowed: i32,
}

impl VmSettings {
    fn read_procfs(&self, file_path: &str) -> Result<i32, String> {
        let file = File::open(file_path)
            .map_err(|err| format!("Failed to open {}: {}", file_path, err))?;

        let reader = BufReader::new(file);

        let line = reader
            .lines()
            .next()
            .ok_or_else(|| format!("File is empty: {}", file_path))?
            .map_err(|err| format!("Failed to read line from {}: {}", file_path, err))?;

        line.trim()
            .parse::<i32>()
            .map_err(|err| format!("Failed to parse {}: {}", file_path, err))
    }

    fn write_procfs(&self, file_path: &str, value: i32) -> Result<(), String> {
        let mut file = File::create(file_path)
            .map_err(|err| format!("Failed to open {}: {}", file_path, err))?;

        write!(file, "{}", value)
            .map_err(|err| format!("Failed to write to {}: {}", file_path, err))
    }

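    /// Save the current value of compact_unevictable_allowed into the global
    /// VM snapshot and then set it to 0, which keeps the kernel's compaction
    /// from touching unevictable (mlocked) pages, presumably to avoid minor
    /// faults on the locked heap.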
    fn save(&self) -> Result<(), String> {
        let compact_unevictable_allowed = "/proc/sys/vm/compact_unevictable_allowed";
        let value = self.read_procfs(compact_unevictable_allowed)?;
        unsafe {
            VM.compact_unevictable_allowed = value;
        };
        self.write_procfs(compact_unevictable_allowed, 0)?;

        Ok(())
    }

    fn restore(&self) -> Result<(), String> {
        let compact_unevictable_allowed = "/proc/sys/vm/compact_unevictable_allowed";
        let value = unsafe { VM.compact_unevictable_allowed };
        self.write_procfs(compact_unevictable_allowed, value)?;

        Ok(())
    }
}

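// Global snapshot of the saved VM settings; written by save() and read back by
// restore() above.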
static mut VM: VmSettings = VmSettings {
    compact_unevictable_allowed: 0,
};