polkavm_linux_raw/
lib.rs

1#![doc = include_str!("../README.md")]
2#![no_std]
3#![deny(clippy::panic)]
4#![deny(clippy::unwrap_used)]
5#![deny(clippy::expect_used)]
6#![deny(clippy::unreachable)]
7#![deny(clippy::indexing_slicing)]
8#![allow(clippy::collapsible_else_if)]
9#![allow(clippy::len_without_is_empty)]
10#![allow(clippy::manual_range_contains)]
11// This crate mostly contains syscall wrappers. If you use them you should know what you're doing.
12#![allow(clippy::missing_safety_doc)]
13#![allow(clippy::undocumented_unsafe_blocks)]
14#![cfg(all(target_os = "linux", target_arch = "x86_64"))]
15
16#[cfg(feature = "std")]
17extern crate std;
18
19mod syscall;
20
21#[cfg(target_arch = "x86_64")]
22#[doc(hidden)]
23pub mod arch_amd64_syscall;
24
25#[cfg(target_arch = "x86_64")]
26#[allow(dead_code)]
27#[allow(non_upper_case_globals)]
28#[allow(non_camel_case_types)]
29#[allow(non_snake_case)]
30#[allow(clippy::ptr_as_ptr)]
31#[allow(clippy::used_underscore_binding)]
32mod arch_amd64_bindings;
33
34mod mmap;
35
36pub use mmap::Mmap;
37
38#[cfg(target_arch = "x86_64")]
39#[doc(hidden)]
40pub use arch_amd64_syscall as syscall_impl;
41
42pub use core::ffi::{c_int, c_long, c_uchar, c_uint, c_ulong, c_ushort, c_void};
43
44use core::ffi::CStr;
45use core::marker::PhantomData;
46use core::mem::MaybeUninit;
47use core::sync::atomic::AtomicU32;
48use core::time::Duration;
49
50#[cfg(feature = "std")]
51use std::borrow::Cow;
52
// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/88345
/// Crate-private stand-in for the not-yet-stable `core::ffi::c_size_t`.
#[allow(non_camel_case_types)]
type c_size_t = usize;

/// Public C-style name for the same pointer-sized unsigned integer as `c_size_t`.
#[allow(non_camel_case_types)]
pub type size_t = usize;
59
// Doesn't appear in public headers.
// Each flag is a distinct single bit.
pub const MNT_FORCE: u32 = 1 << 0;
pub const MNT_DETACH: u32 = 1 << 1;
pub const MNT_EXPIRE: u32 = 1 << 2;

// Special values for a signal handler slot.
pub const SIG_DFL: usize = 0x0;
pub const SIG_IGN: usize = 0x1;

// Bindgen seems to not want to emit this constant,
// so let's define it manually.
pub const HWCAP2_FSGSBASE: usize = 0b10;
71
72#[rustfmt::skip]
73pub use crate::arch_amd64_bindings::{
74    __kernel_gid_t as gid_t,
75    __kernel_pid_t as pid_t,
76    __kernel_uid_t as uid_t,
77    __NR_arch_prctl as SYS_arch_prctl,
78    __NR_capset as SYS_capset,
79    __NR_chdir as SYS_chdir,
80    __NR_clock_gettime as SYS_clock_gettime,
81    __NR_clone as SYS_clone,
82    __NR_clone3 as SYS_clone3,
83    __NR_close as SYS_close,
84    __NR_close_range as SYS_close_range,
85    __NR_dup3 as SYS_dup3,
86    __NR_execveat as SYS_execveat,
87    __NR_exit as SYS_exit,
88    __NR_fchdir as SYS_fchdir,
89    __NR_fcntl as SYS_fcntl,
90    __NR_ftruncate as SYS_ftruncate,
91    __NR_futex as SYS_futex,
92    __NR_getdents64 as SYS_getdents64,
93    __NR_getgid as SYS_getgid,
94    __NR_getpid as SYS_getpid,
95    __NR_getuid as SYS_getuid,
96    __NR_kill as SYS_kill,
97    __NR_lseek as SYS_lseek,
98    __NR_madvise as SYS_madvise,
99    __NR_memfd_create as SYS_memfd_create,
100    __NR_mmap as SYS_mmap,
101    __NR_mount as SYS_mount,
102    __NR_mprotect as SYS_mprotect,
103    __NR_mremap as SYS_mremap,
104    __NR_munmap as SYS_munmap,
105    __NR_open as SYS_open,
106    __NR_openat as SYS_openat,
107    __NR_pidfd_send_signal as SYS_pidfd_send_signal,
108    __NR_pipe2 as SYS_pipe2,
109    __NR_pivot_root as SYS_pivot_root,
110    __NR_prctl as SYS_prctl,
111    __NR_process_vm_readv as SYS_process_vm_readv,
112    __NR_process_vm_writev as SYS_process_vm_writev,
113    __NR_ptrace as SYS_ptrace,
114    __NR_read as SYS_read,
115    __NR_recvmsg as SYS_recvmsg,
116    __NR_rt_sigaction as SYS_rt_sigaction,
117    __NR_rt_sigprocmask as SYS_rt_sigprocmask,
118    __NR_rt_sigreturn as SYS_rt_sigreturn,
119    __NR_sched_yield as SYS_sched_yield,
120    __NR_seccomp as SYS_seccomp,
121    __NR_sendmsg as SYS_sendmsg,
122    __NR_set_tid_address as SYS_set_tid_address,
123    __NR_setdomainname as SYS_setdomainname,
124    __NR_sethostname as SYS_sethostname,
125    __NR_setrlimit as SYS_setrlimit,
126    __NR_sigaltstack as SYS_sigaltstack,
127    __NR_socketpair as SYS_socketpair,
128    __NR_umount2 as SYS_umount2,
129    __NR_unshare as SYS_unshare,
130    __NR_waitid as SYS_waitid,
131    __NR_write as SYS_write,
132    __NR_writev as SYS_writev,
133    __user_cap_data_struct,
134    __user_cap_header_struct,
135    __WALL,
136    _LINUX_CAPABILITY_VERSION_3,
137    ARCH_GET_FS,
138    ARCH_GET_GS,
139    ARCH_SET_FS,
140    ARCH_SET_GS,
141    AT_EMPTY_PATH,
142    AT_HWCAP2,
143    AT_MINSIGSTKSZ,
144    AT_NULL,
145    AT_PAGESZ,
146    AT_SYSINFO_EHDR,
147    CLD_CONTINUED,
148    CLD_DUMPED,
149    CLD_EXITED,
150    CLD_KILLED,
151    CLD_STOPPED,
152    CLD_TRAPPED,
153    CLOCK_MONOTONIC_RAW,
154    CLONE_CLEAR_SIGHAND,
155    CLONE_NEWCGROUP,
156    CLONE_NEWIPC,
157    CLONE_NEWNET,
158    CLONE_NEWNS,
159    CLONE_NEWPID,
160    CLONE_NEWUSER,
161    CLONE_NEWUTS,
162    CLONE_PIDFD,
163    E2BIG,
164    EACCES,
165    EAGAIN,
166    EBADF,
167    EBUSY,
168    ECHILD,
169    EDOM,
170    EEXIST,
171    EFAULT,
172    EFBIG,
173    EINTR,
174    EINVAL,
175    EIO,
176    EISDIR,
177    EMFILE,
178    EMLINK,
179    ENFILE,
180    ENODEV,
181    ENOENT,
182    ENOEXEC,
183    ENOMEM,
184    ENOSPC,
185    ENOTBLK,
186    ENOTDIR,
187    ENOTTY,
188    ENXIO,
189    EPERM,
190    EPIPE,
191    ERANGE,
192    EROFS,
193    ESPIPE,
194    ESRCH,
195    ETIMEDOUT,
196    ETXTBSY,
197    EXDEV,
198    F_ADD_SEALS,
199    F_SEAL_GROW,
200    F_SEAL_SEAL,
201    F_SEAL_SHRINK,
202    F_SEAL_WRITE,
203    F_SETFL,
204    F_SETOWN,
205    F_SETSIG,
206    FUTEX_WAIT,
207    FUTEX_WAKE,
208    iovec,
209    linux_dirent64,
210    MADV_DONTNEED,
211    MAP_ANONYMOUS,
212    MAP_FIXED,
213    MAP_POPULATE,
214    MAP_PRIVATE,
215    MAP_SHARED,
216    MFD_ALLOW_SEALING,
217    MFD_CLOEXEC,
218    MINSIGSTKSZ,
219    MREMAP_FIXED,
220    MREMAP_MAYMOVE,
221    MS_BIND,
222    MS_NODEV,
223    MS_NOEXEC,
224    MS_NOSUID,
225    MS_PRIVATE,
226    MS_RDONLY,
227    MS_REC,
228    O_CLOEXEC,
229    O_DIRECTORY,
230    O_NONBLOCK,
231    O_PATH,
232    O_RDONLY,
233    O_RDWR,
234    O_WRONLY,
235    P_ALL,
236    P_PGID,
237    P_PID,
238    P_PIDFD,
239    PROT_EXEC,
240    PROT_READ,
241    PROT_WRITE,
242    RLIMIT_DATA,
243    RLIMIT_FSIZE,
244    RLIMIT_LOCKS,
245    RLIMIT_MEMLOCK,
246    RLIMIT_MSGQUEUE,
247    RLIMIT_NOFILE,
248    RLIMIT_NPROC,
249    RLIMIT_STACK,
250    rlimit,
251    rusage,
252    SA_NODEFER,
253    SA_ONSTACK,
254    SA_RESTORER,
255    SA_SIGINFO,
256    SECCOMP_RET_ALLOW,
257    SECCOMP_RET_KILL_THREAD,
258    SECCOMP_SET_MODE_FILTER,
259    SIG_BLOCK,
260    SIG_SETMASK,
261    SIG_UNBLOCK,
262    SIGABRT,
263    sigaction as kernel_sigaction,
264    SIGBUS,
265    SIGCHLD,
266    SIGCONT,
267    SIGFPE,
268    SIGHUP,
269    SIGILL,
270    siginfo_t,
271    SIGINT,
272    SIGIO,
273    SIGKILL,
274    SIGPIPE,
275    SIGSEGV,
276    sigset_t as kernel_sigset_t,
277    SIGSTOP,
278    SIGSYS,
279    SIGTERM,
280    SIGTRAP,
281    timespec,
282    WEXITED,
283    WNOHANG,
284};
285
286impl siginfo_t {
287    pub unsafe fn si_signo(&self) -> c_int {
288        self.__bindgen_anon_1.__bindgen_anon_1.si_signo
289    }
290
291    pub unsafe fn si_code(&self) -> c_int {
292        self.__bindgen_anon_1.__bindgen_anon_1.si_code
293    }
294
295    pub unsafe fn si_pid(&self) -> pid_t {
296        self.__bindgen_anon_1.__bindgen_anon_1._sifields._sigchld._pid
297    }
298
299    pub unsafe fn si_status(&self) -> c_int {
300        self.__bindgen_anon_1.__bindgen_anon_1._sifields._sigchld._status
301    }
302}
303
/// Returns `true` when the low seven bits of `status` hold a value strictly between
/// `0` and `0x7f`.
#[allow(non_snake_case)]
pub const fn WIFSIGNALED(status: c_int) -> bool {
    matches!(status & 0x7f, 0x01..=0x7e)
}
308
/// Extracts the low seven bits of `status`.
#[allow(non_snake_case)]
pub const fn WTERMSIG(status: c_int) -> c_int {
    const LOW_SEVEN_BITS: c_int = 0b111_1111;
    status & LOW_SEVEN_BITS
}
313
/// Returns `true` when the low seven bits of `status` are all zero.
#[allow(non_snake_case)]
pub const fn WIFEXITED(status: c_int) -> bool {
    matches!(status & 0x7f, 0)
}
318
/// Extracts bits 8..=15 of `status` as a value in `0..=255`.
#[allow(non_snake_case)]
pub const fn WEXITSTATUS(status: c_int) -> c_int {
    // Mask first, then shift: the masked value is non-negative so the result is the same
    // as shifting first and masking afterwards.
    (status & 0xff00) >> 8
}
323
/// C-style name for a 32-bit unsigned length; used below as the type of
/// `msghdr::msg_namelen`.
#[allow(non_camel_case_types)]
pub type socklen_t = u32;
326
// Source: linux/arch/x86/include/uapi/asm/signal.h
/// `#[repr(C)]` mirror of the kernel structure from the header cited above, so the field
/// order and layout follow the C definition.
// NOTE(review): presumably the argument of `sigaltstack` (this crate re-exports
// `SYS_sigaltstack`) - confirm against the callers.
#[derive(Debug)]
#[repr(C)]
pub struct stack_t {
    pub ss_sp: *mut c_void,
    pub ss_flags: c_int,
    pub ss_size: usize,
}
335
// Source: linux/include/uapi/asm-generic/ucontext.h
/// `#[repr(C)]` mirror of the kernel structure from the header cited above.
///
/// Embeds the alternate-stack description (`stack_t`), the saved register state
/// (`sigcontext`) and a kernel signal mask (`kernel_sigset_t`) by value.
#[derive(Debug)]
#[repr(C)]
pub struct ucontext {
    pub uc_flags: c_ulong,
    // Raw pointer to another `ucontext`; never dereferenced in this definition.
    pub uc_link: *mut ucontext,
    pub uc_stack: stack_t,
    pub uc_mcontext: sigcontext,
    pub uc_sigmask: kernel_sigset_t,
}
346
// Source: linux/arch/x86/include/uapi/asm/sigcontext.h
/// `#[repr(C)]` mirror of the x86_64 kernel structure from the header cited above.
///
/// The field order must not be changed: with `#[repr(C)]` it determines the offsets at
/// which each saved register is found.
#[derive(Debug)]
#[repr(C)]
pub struct sigcontext {
    // General purpose registers.
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    pub rdi: u64,
    pub rsi: u64,
    pub rbp: u64,
    pub rbx: u64,
    pub rdx: u64,
    pub rax: u64,
    pub rcx: u64,
    pub rsp: u64,
    pub rip: u64,
    pub eflags: u64,
    // Four 16-bit values packed into one 64-bit slot.
    pub cs: u16,
    pub gs: u16,
    pub fs: u16,
    pub ss: u16,
    pub err: u64,
    pub trapno: u64,
    pub oldmask: u64,
    pub cr2: u64,
    // Raw pointer to the separately stored floating point state (see `fpstate`).
    pub fpstate: *mut fpstate,
    pub reserved: [u64; 8],
}
380
/// `#[repr(C)]` layout of the floating point state that `sigcontext::fpstate` points to.
// NOTE(review): presumably mirrors the kernel's 64-bit `_fpstate` from the same
// sigcontext.h header - confirm.
#[repr(C)]
pub struct fpstate {
    pub cwd: u16,
    pub swd: u16,
    pub twd: u16,
    pub fop: u16,
    pub rip: u64,
    pub rdp: u64,
    pub mxcsr: u32,
    pub mxcsr_mask: u32,
    pub st_space: [u32; 32],  /*  8x  FP registers, 16 bytes each */
    pub xmm_space: [u32; 64], /* 16x XMM registers, 16 bytes each */
    pub reserved_1: [u32; 12],
    // Trailing software-defined area; see `fpx_sw_bytes`.
    pub sw_reserved: fpx_sw_bytes,
}
396
/// `#[repr(C)]` layout of the trailing `sw_reserved` area of `fpstate`.
// NOTE(review): field meanings are taken from their names only (magic value, sizes,
// feature mask) - confirm against the kernel's sigcontext.h before relying on them.
#[repr(C)]
pub struct fpx_sw_bytes {
    pub magic1: u32,
    pub extended_size: u32,
    pub xfeatures: u64,
    pub xstate_size: u32,
    pub padding: [u32; 7],
}
405
/// `#[repr(C)]` message header; the `CMSG_*` helpers in this file read its
/// `msg_control` / `msg_controllen` fields.
// NOTE(review): presumably the structure passed to the `sendmsg`/`recvmsg` syscalls
// (this crate re-exports `SYS_sendmsg` and `SYS_recvmsg`) - confirm against the callers.
#[repr(C)]
pub struct msghdr {
    pub msg_name: *mut c_void,
    pub msg_namelen: socklen_t,
    // Pointer to an array of `iovec` entries and the length that goes with it.
    pub msg_iov: *mut iovec,
    pub msg_iovlen: c_size_t,
    // Ancillary data buffer and its size in bytes (compared against
    // `size_of::<cmsghdr>()` by `CMSG_FIRSTHDR`).
    pub msg_control: *mut c_void,
    pub msg_controllen: c_size_t,
    pub msg_flags: c_int,
}
416
/// `#[repr(C)]` header of one ancillary (control) message inside `msghdr::msg_control`.
///
/// `CMSG_DATA` treats the bytes directly after this header as the message payload.
#[repr(C)]
pub struct cmsghdr {
    pub cmsg_len: c_size_t,
    pub cmsg_level: c_int,
    pub cmsg_type: c_int,
}
423
/// Crate-private `#[repr(C)]` pair of an instruction count and a pointer to the first
/// `sock_filter` of a BPF program.
// NOTE(review): presumably what gets handed to the `seccomp` syscall when installing a
// filter - the use site is not visible in this part of the file.
#[repr(C)]
struct sock_fprog {
    pub length: c_ushort,
    pub filter: *const sock_filter,
}
429
/// A single classic BPF instruction, as produced by the `bpf!` macro.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[repr(C)]
pub struct sock_filter {
    // Opcode, assembled by OR-ing the `BPF_*` constants together.
    pub code: u16,
    // Relative jump offset taken when a conditional jump's test is true.
    pub jt: u8,
    // Relative jump offset taken when a conditional jump's test is false.
    pub jf: u8,
    // Generic 32-bit operand (immediate, absolute offset, return value or jump offset).
    pub k: u32,
}
438
// Every constant below is written as `field_value << field_shift`, so the position of
// the bit field inside the 16-bit opcode is visible at a glance.

// BPF instruction classes (3 bits, mask: 0b00111)
pub const BPF_LD: u16 = 0;
pub const BPF_LDX: u16 = 1;
pub const BPF_ST: u16 = 2;
pub const BPF_STX: u16 = 3;
pub const BPF_ALU: u16 = 4;
pub const BPF_JMP: u16 = 5;
pub const BPF_RET: u16 = 6;
pub const BPF_MISC: u16 = 7;

// BPF LD/LDX/ST/STX width (2 bits, mask: 0b11000)
pub const BPF_W: u16 = 0 << 3; // 32-bit
pub const BPF_H: u16 = 1 << 3; // 16-bit
pub const BPF_B: u16 = 2 << 3; // 8-bit

// BPF LD/LDX/ST/STX addressing mode (3 bits, mask: 0b11100000)
pub const BPF_IMM: u16 = 0 << 5;
pub const BPF_ABS: u16 = 1 << 5;
pub const BPF_IND: u16 = 2 << 5;
pub const BPF_MEM: u16 = 3 << 5;
pub const BPF_LEN: u16 = 4 << 5;
pub const BPF_MSH: u16 = 5 << 5;

// BPF ALU operations (4 bits, mask: 0b11110000)
pub const BPF_ADD: u16 = 0 << 4;
pub const BPF_SUB: u16 = 1 << 4;
pub const BPF_MUL: u16 = 2 << 4;
pub const BPF_DIV: u16 = 3 << 4;
pub const BPF_OR: u16 = 4 << 4;
pub const BPF_AND: u16 = 5 << 4;
pub const BPF_LSH: u16 = 6 << 4;
pub const BPF_RSH: u16 = 7 << 4;
pub const BPF_NEG: u16 = 8 << 4;
pub const BPF_MOD: u16 = 9 << 4;
pub const BPF_XOR: u16 = 10 << 4;

// BPF JMP operations (4 bits, mask: 0b11110000)
pub const BPF_JA: u16 = 0 << 4;
pub const BPF_JEQ: u16 = 1 << 4;
pub const BPF_JGT: u16 = 2 << 4;
pub const BPF_JGE: u16 = 3 << 4;
pub const BPF_JSET: u16 = 4 << 4;

// BPF ALU/JMP source (1 bit, mask: 0b1000)
pub const BPF_K: u16 = 0 << 3;
pub const BPF_X: u16 = 1 << 3;
485
// Secure-bit flags: every flag is a single bit, and each `*_LOCKED` flag is the bit
// directly above the flag it belongs to.
pub const SECBIT_NOROOT: u32 = 1 << 0;
pub const SECBIT_NOROOT_LOCKED: u32 = 1 << 1;
pub const SECBIT_NO_SETUID_FIXUP: u32 = 1 << 2;
pub const SECBIT_NO_SETUID_FIXUP_LOCKED: u32 = 1 << 3;
pub const SECBIT_KEEP_CAPS: u32 = 1 << 4;
pub const SECBIT_KEEP_CAPS_LOCKED: u32 = 1 << 5;
pub const SECBIT_NO_CAP_AMBIENT_RAISE: u32 = 1 << 6;
pub const SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED: u32 = 1 << 7;
494
/// Assembles a fixed-size array of [`sock_filter`] instructions from a small BPF DSL.
///
/// Every instruction is written inside parentheses and followed by a comma. An
/// instruction may be prefixed with `[N]:` to define the numeric label `N`; jump
/// instructions refer to it as `@N`. Jump offsets are relative to the instruction that
/// follows the jump and must lie in `0..=255`, otherwise the expansion panics with
/// "invalid jump". Values compared against `a` must fit into 32 bits, otherwise the
/// expansion panics with "out of range value".
///
/// The arms starting with `@` are internal helpers and are not meant to be used directly.
#[macro_export]
macro_rules! bpf {
    // Maps any single token tree to the literal `1`; used for counting.
    (@const_one $tok:tt) => {
        1
    };

    // Yields the label of a labelled instruction...
    (@get_label_or_zero ([$label:expr]: $($tok:tt)+)) => {
        $label
    };

    // ...and `0` for an instruction without a label.
    (@get_label_or_zero ($($tok:tt)+)) => {
        0
    };

    // Counts the instructions; this becomes the length of the output array.
    (@count_instructions
        $(
            ($($tok:tt)+)
        ),+
    ) => {{
        let mut count = 0;
        $(
            count += $crate::bpf!(@const_one ($($tok)+));
        )+

        count
    }};

    // Computes the size of the label table: the highest label used, plus one.
    (@max_label_plus_one
        $(
            ($($tok:tt)+)
        ),+
    ) => {{
        let mut max = -1;
        $(
            let label = $crate::bpf!(@get_label_or_zero ($($tok)+));
            if label > max {
                max = label;
            }
        )+

        if max < 0 {
            0
        } else {
            (max + 1) as usize
        }
    }};

    // Records the index of a labelled instruction in the label table.
    (@fill_label $labels:expr, $nth_instruction:expr, [$label:expr]: $($tok:tt)+) => {
        $labels[$label] = $nth_instruction;
    };

    // Instructions without a label leave the label table untouched.
    (@fill_label $labels:expr, $nth_instruction:expr, $($tok:tt)+) => {};

    // First pass: walks all instructions in order and fills in the label table.
    (@fill_labels
        $labels:expr,
        $(
            ($($tok:tt)+)
        ),+
    ) => {{
        let mut nth_instruction = 0;
        $(
            $crate::bpf!(@fill_label $labels, nth_instruction, $($tok)+);
            #[allow(unused_assignments)]
            {
                nth_instruction += 1;
            }
        )+
    }};

    // Converts a label into a relative jump offset, counted from the instruction
    // following the jump. Backward jumps and offsets above 255 are rejected.
    (@target $labels:expr, $nth_instruction:expr, $target:expr) => {{
        let target = ($labels[$target] as i32 - $nth_instruction as i32 - 1);
        if target < 0 || target > 255 {
            panic!("invalid jump");
        }

        target as u8
    }};

    // Converts a comparison operand to `u32`, accepting anything in
    // `i32::MIN..=u32::MAX` (negative values are reinterpreted by the `as` cast).
    (@into_u32 $value:expr) => {{
        let value = $value;
        if value as i128 > core::u32::MAX as i128 || (value as i128) < core::i32::MIN as i128 {
            panic!("out of range value");
        }
        value as u32
    }};

    // Strips the label prefix and encodes the instruction that follows it.
    (@op $labels:expr, $nth_instruction:expr, [$label:expr]: $($tok:tt)+) => { $crate::bpf!(@op $labels, $nth_instruction, $($tok)+) };

    // Encoders for the individual instructions of the DSL.
    (@op $labels:expr, $nth_instruction:expr, a = *abs[$addr:expr]) => { $crate::sock_filter { code: $crate::BPF_LD | $crate::BPF_W | $crate::BPF_ABS, jt: 0, jf: 0, k: $addr } };
    (@op $labels:expr, $nth_instruction:expr, a &= $value:expr) => { $crate::sock_filter { code: $crate::BPF_ALU | $crate::BPF_AND | $crate::BPF_K, jt: 0, jf: 0, k: $value } };
    (@op $labels:expr, $nth_instruction:expr, if a == $value:expr => jump @$target:expr) => { $crate::sock_filter { code: $crate::BPF_JMP | $crate::BPF_JEQ | $crate::BPF_K, jt: $crate::bpf!(@target $labels, $nth_instruction, $target), jf: 0, k: $crate::bpf!(@into_u32 $value) } };
    // `!=` reuses the "jump if equal" opcode with the true/false offsets swapped.
    (@op $labels:expr, $nth_instruction:expr, if a != $value:expr => jump @$target:expr) => { $crate::sock_filter { code: $crate::BPF_JMP | $crate::BPF_JEQ | $crate::BPF_K, jt: 0, jf: $crate::bpf!(@target $labels, $nth_instruction, $target), k: $crate::bpf!(@into_u32 $value) } };
    (@op $labels:expr, $nth_instruction:expr, jump @$target:expr) => { $crate::sock_filter { code: $crate::BPF_JMP | $crate::BPF_JA, jt: 0, jf: 0, k: $crate::bpf!(@target $labels, $nth_instruction, $target) as u32 } };
    (@op $labels:expr, $nth_instruction:expr, return $value:expr) => { $crate::sock_filter { code: $crate::BPF_RET | $crate::BPF_K, jt: 0, jf: 0, k: $value } };
    // Shorthands which expand to the generic instructions above.
    (@op $labels:expr, $nth_instruction:expr, seccomp_allow) => { $crate::bpf!(@op $labels, $nth_instruction, return $crate::SECCOMP_RET_ALLOW) };
    (@op $labels:expr, $nth_instruction:expr, seccomp_kill_thread) => { $crate::bpf!(@op $labels, $nth_instruction, return $crate::SECCOMP_RET_KILL_THREAD) };
    (@op $labels:expr, $nth_instruction:expr, a = syscall_nr) => { $crate::bpf!(@op $labels, $nth_instruction, a = *abs[0]) };
    (@op $labels:expr, $nth_instruction:expr, a = syscall_arg[$nth_arg:expr]) => { $crate::bpf!(@op $labels, $nth_instruction, a = *abs[16 + $nth_arg * 8]) };

    // Public entry point.
    (
        $(
            ($($tok:tt)+),
        )+
    ) => {{
        // Output array with one zeroed slot per instruction.
        let mut filter = [
            $crate::sock_filter { code: 0, jt: 0, jf: 0, k: 0 };
            { $crate::bpf!(@count_instructions $(($($tok)+)),+) }
        ];

        // Label table: maps a label to the index of the instruction carrying it.
        let mut labels = [
            0;
            { $crate::bpf!(@max_label_plus_one $(($($tok)+)),+) }
        ];

        $crate::bpf!(@fill_labels labels, $(($($tok)+)),+);

        {
            // Second pass: encode every instruction now that all labels are known.
            let mut nth_instruction = 0;

            $(
                #[allow(clippy::indexing_slicing)]
                {
                    filter[nth_instruction] = $crate::bpf!(@op labels, nth_instruction, $($tok)+);
                }
                nth_instruction += 1;
            )+

            // Silences the "value assigned is never read" warning for the last increment.
            let _ = nth_instruction;
        }

        filter
    }};
}
628
/// Checks that `bpf!` resolves labels into the expected relative jump offsets.
#[test]
fn test_bpf_jump() {
    // Builders for the two instruction shapes which appear in the expected output.
    let jump_if_equal = |k: u32, jt: u8| sock_filter {
        code: BPF_JMP | BPF_JEQ | BPF_K,
        jt,
        jf: 0,
        k,
    };
    let ret = |k: u32| sock_filter {
        code: BPF_RET,
        jt: 0,
        jf: 0,
        k,
    };

    // A single jump which skips over exactly one instruction.
    let skip_one = bpf! {
        (if a == 1234 => jump @0),
        (return 10),
        ([0]: return 20),
    };
    assert_eq!(skip_one, [jump_if_equal(1234, 1), ret(10), ret(20)]);

    // Two jumps to the same label from different positions get different offsets.
    let shared_target = bpf! {
        (if a == 20 => jump @2),
        (if a == 10 => jump @2),
        ([0]: return 0),
        ([1]: return 1),
        ([2]: return 2),
    };
    assert_eq!(
        shared_target,
        [
            jump_if_equal(20, 3),
            jump_if_equal(10, 2),
            ret(0),
            ret(1),
            ret(2),
        ]
    );
}
701
// Descriptor numbers of the three standard streams.
pub const STDIN_FILENO: c_int = 0;
pub const STDOUT_FILENO: c_int = 1;
pub const STDERR_FILENO: c_int = 2;

// Socket related constants; the flag values are single bits.
pub const AF_UNIX: u32 = 1;
pub const SOCK_STREAM: u32 = 1;
pub const SOCK_SEQPACKET: u32 = 5;
pub const SOCK_CLOEXEC: u32 = 1 << 19;
pub const SOL_SOCKET: c_int = 1;
pub const SCM_RIGHTS: c_int = 1;
pub const MSG_NOSIGNAL: u32 = 1 << 14;

// `whence` values for `lseek`.
pub const SEEK_SET: u32 = 0;
pub const SEEK_CUR: u32 = 1;
pub const SEEK_END: u32 = 2;

pub const O_ASYNC: u32 = 1 << 13;
719
/// Rounds `len` up to the next multiple of `size_of::<usize>()`.
#[allow(non_snake_case)]
const fn CMSG_ALIGN(len: usize) -> usize {
    let word = core::mem::size_of::<usize>();
    let bumped = len + word - 1;
    // `word` is a power of two, so clearing the low bits rounds down to a multiple of it.
    bumped & !(word - 1)
}
724
725#[allow(non_snake_case)]
726pub unsafe fn CMSG_FIRSTHDR(mhdr: *const msghdr) -> *mut cmsghdr {
727    if (*mhdr).msg_controllen >= core::mem::size_of::<cmsghdr>() {
728        (*mhdr).msg_control.cast::<cmsghdr>()
729    } else {
730        core::ptr::null_mut()
731    }
732}
733
734#[allow(non_snake_case)]
735pub unsafe fn CMSG_DATA(cmsg: *mut cmsghdr) -> *mut c_uchar {
736    cmsg.add(1).cast::<c_uchar>()
737}
738
739#[allow(non_snake_case)]
740pub const fn CMSG_SPACE(length: usize) -> usize {
741    CMSG_ALIGN(length) + CMSG_ALIGN(core::mem::size_of::<cmsghdr>())
742}
743
744#[allow(non_snake_case)]
745pub const fn CMSG_LEN(length: usize) -> usize {
746    CMSG_ALIGN(core::mem::size_of::<cmsghdr>()) + length
747}
748
// The following was copied from the `cstr_core` crate.
//
// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
/// Returns `true` when `bytes` is non-empty, ends with a NUL byte and contains no other
/// NUL byte.
///
/// Written with a plain `while` loop so that it can be evaluated in `const` context.
#[inline]
#[doc(hidden)]
#[allow(clippy::indexing_slicing)]
pub const fn cstr_is_valid(bytes: &[u8]) -> bool {
    let length = bytes.len();
    if length == 0 {
        return false;
    }

    // Everything before the final byte must be non-zero...
    let last = length - 1;
    let mut position = 0;
    while position < last {
        if bytes[position] == 0 {
            return false;
        }
        position += 1;
    }

    // ...and the final byte must be the terminator.
    bytes[last] == 0
}
769
/// Builds a `&CStr` from `$e` at compile time.
///
/// A NUL terminator is appended with `concat!`, so `$e` must be something `concat!`
/// accepts. If the resulting bytes contain an interior NUL (as decided by
/// `cstr_is_valid`) the expansion fails to compile.
#[macro_export]
macro_rules! cstr {
    ($e:expr) => {{
        const STR: &[u8] = concat!($e, "\0").as_bytes();
        const STR_VALID: bool = $crate::cstr_is_valid(STR);
        // Compile-time assertion: for an invalid string the array length becomes
        // `0 - 1`, which is rejected by the compiler.
        let _ = [(); 0 - (!(STR_VALID) as usize)];
        // The `unsafe` block is redundant when the macro is used inside another
        // `unsafe` block, hence the `allow`.
        #[allow(unused_unsafe)]
        unsafe {
            core::ffi::CStr::from_bytes_with_nul_unchecked(STR)
        }
    }}
}
782
/// Error type of this crate: a human readable message plus an optional errno value.
#[derive(Clone, Debug)]
pub struct Error {
    // Without `std` only static messages are supported...
    #[cfg(not(feature = "std"))]
    message: &'static str,
    // ...while with `std` the message may also be an owned `String`.
    #[cfg(feature = "std")]
    message: Cow<'static, str>,
    // `0` means "no errno available"; formatting then prints only the message.
    errno: c_int,
}
791
792impl core::fmt::Display for Error {
793    #[cold]
794    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
795        let mut is_err = false;
796        self.fmt_to_string(|chunk| {
797            if fmt.write_str(chunk).is_err() {
798                is_err = true;
799            }
800        });
801
802        if is_err {
803            Err(core::fmt::Error)
804        } else {
805            Ok(())
806        }
807    }
808}
809
// Only available with the `std` feature; all trait methods keep their default
// implementations.
#[cfg(feature = "std")]
impl std::error::Error for Error {}
812
/// Wraps an owned message into an `Error` which carries no errno.
#[cfg(feature = "std")]
impl From<std::string::String> for Error {
    fn from(message: std::string::String) -> Self {
        Self {
            message: Cow::Owned(message),
            errno: 0,
        }
    }
}
822
823impl From<&'static str> for Error {
824    fn from(message: &'static str) -> Self {
825        Error::from_str(message)
826    }
827}
828
/// Writes `value` in decimal by calling `write_str` once per digit, most significant
/// digit first.
fn write_number(value: u32, write_str: &mut dyn FnMut(&str)) {
    let higher_digits = value / 10;
    let lowest_digit = value % 10;

    // Emit everything above the lowest digit first.
    if higher_digits != 0 {
        write_number(higher_digits, write_str);
    }

    let ascii = [b'0' + lowest_digit as u8];
    // The single byte is always an ASCII digit, hence valid UTF-8.
    write_str(unsafe { core::str::from_utf8_unchecked(&ascii) });
}
841
842impl Error {
843    pub fn fmt_to_string(&self, mut write_str: impl FnMut(&str)) {
844        self.fmt_to_string_impl(&mut write_str);
845    }
846
847    // Avoid pulling in core::fmt machinery to keep the code size low.
848    #[cold]
849    fn fmt_to_string_impl(&self, write_str: &mut dyn FnMut(&str)) {
850        write_str(&self.message);
851
852        if self.errno == 0 {
853            return;
854        }
855
856        write_str(" (errno = ");
857        write_number(self.errno as u32, write_str);
858
859        let errno = match self.errno as u32 {
860            EPERM => Some("EPERM"),
861            ENOENT => Some("ENOENT"),
862            ESRCH => Some("ESRCH"),
863            EINTR => Some("EINTR"),
864            EIO => Some("EIO"),
865            ENXIO => Some("ENXIO"),
866            E2BIG => Some("E2BIG"),
867            ENOEXEC => Some("ENOEXEC"),
868            EBADF => Some("EBADF"),
869            ECHILD => Some("ECHILD"),
870            EAGAIN => Some("EAGAIN"),
871            ENOMEM => Some("ENOMEM"),
872            EACCES => Some("EACCES"),
873            EFAULT => Some("EFAULT"),
874            ENOTBLK => Some("ENOTBLK"),
875            EBUSY => Some("EBUSY"),
876            EEXIST => Some("EEXIST"),
877            EXDEV => Some("EXDEV"),
878            ENODEV => Some("ENODEV"),
879            ENOTDIR => Some("ENOTDIR"),
880            EISDIR => Some("EISDIR"),
881            EINVAL => Some("EINVAL"),
882            ENFILE => Some("ENFILE"),
883            EMFILE => Some("EMFILE"),
884            ENOTTY => Some("ENOTTY"),
885            ETXTBSY => Some("ETXTBSY"),
886            EFBIG => Some("EFBIG"),
887            ENOSPC => Some("ENOSPC"),
888            ESPIPE => Some("ESPIPE"),
889            EROFS => Some("EROFS"),
890            EMLINK => Some("EMLINK"),
891            EPIPE => Some("EPIPE"),
892            EDOM => Some("EDOM"),
893            ERANGE => Some("ERANGE"),
894            _ => None,
895        };
896
897        if let Some(errno) = errno {
898            write_str(" (");
899            write_str(errno);
900            write_str(")");
901        }
902
903        write_str(")");
904    }
905
906    #[cfg(feature = "std")]
907    #[cold]
908    pub fn from_os_error(message: &'static str, error: std::io::Error) -> Self {
909        Self {
910            message: message.into(),
911            errno: error.raw_os_error().unwrap_or(0),
912        }
913    }
914
915    #[cfg(feature = "std")]
916    #[cold]
917    pub fn from_last_os_error(message: &'static str) -> Self {
918        Self {
919            message: message.into(),
920            errno: std::io::Error::last_os_error().raw_os_error().unwrap_or(0),
921        }
922    }
923
924    #[cold]
925    pub const fn from_errno(message: &'static str, errno: i32) -> Self {
926        Self {
927            #[cfg(not(feature = "std"))]
928            message,
929            #[cfg(feature = "std")]
930            message: Cow::Borrowed(message),
931
932            errno,
933        }
934    }
935
936    #[cold]
937    pub const fn from_str(message: &'static str) -> Self {
938        Self {
939            #[cfg(not(feature = "std"))]
940            message,
941            #[cfg(feature = "std")]
942            message: Cow::Borrowed(message),
943
944            errno: 0,
945        }
946    }
947
948    #[inline]
949    pub fn from_syscall(message: &'static str, result: i64) -> Result<(), Self> {
950        if result >= -4095 && result < 0 {
951            Err(Self::from_syscall_unchecked(message, result))
952        } else {
953            Ok(())
954        }
955    }
956
957    #[cold]
958    #[inline]
959    const fn from_syscall_unchecked(message: &'static str, result: i64) -> Self {
960        Self {
961            #[cfg(not(feature = "std"))]
962            message,
963            #[cfg(feature = "std")]
964            message: Cow::Borrowed(message),
965
966            errno: -result as i32,
967        }
968    }
969
970    #[inline]
971    pub fn errno(&self) -> u32 {
972        self.errno as u32
973    }
974}
975
/// Terminates execution by running the x86 `ud2` instruction; never returns.
///
/// Marked `#[cold]` and `#[inline(never)]` so that the call sites stay small.
#[cfg(target_arch = "x86_64")]
#[inline(never)]
#[cold]
pub fn abort() -> ! {
    // In practice `core::hint::unreachable_unchecked` emits this,
    // but technically calling it is undefined behavior which could
    // affect unrelated code, so let's just call it through `asm!`.

    unsafe {
        // `noreturn` tells the compiler that control never comes back from the asm block.
        core::arch::asm!("ud2", options(noreturn, nostack));
    }
}
988
/// An owned file descriptor. Will be automatically closed on drop.
///
/// A negative inner value means "nothing to close": closing or leaking the descriptor
/// replaces the inner value with `-1`.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct Fd(c_int);
993
/// An unowned file descriptor.
///
/// The `PhantomData<&'a Fd>` ties the reference to the lifetime of the `Fd` it was
/// borrowed from; the type is `Copy` and is never closed on drop.
#[repr(transparent)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct FdRef<'a>(c_int, PhantomData<&'a Fd>);
998
999impl Fd {
1000    pub fn raw(&self) -> c_int {
1001        self.0
1002    }
1003
1004    #[inline]
1005    pub const fn from_raw_unchecked(fd: c_int) -> Self {
1006        Self(fd)
1007    }
1008
1009    pub fn borrow(&self) -> FdRef {
1010        FdRef(self.0, PhantomData)
1011    }
1012
1013    pub fn close(mut self) -> Result<(), Error> {
1014        self.close_inplace()?;
1015        Ok(())
1016    }
1017
1018    pub fn leak(mut self) -> c_int {
1019        core::mem::replace(&mut self.0, -1)
1020    }
1021
1022    fn close_inplace(&mut self) -> Result<(), Error> {
1023        if self.raw() < 0 {
1024            return Ok(());
1025        }
1026
1027        let fd = core::mem::replace(&mut self.0, -1);
1028        let result = unsafe { syscall_readonly!(SYS_close, fd) };
1029        Error::from_syscall("close", result)
1030    }
1031}
1032
/// Closes the descriptor when the `Fd` goes out of scope.
impl Drop for Fd {
    fn drop(&mut self) {
        // Best effort: `drop` has no way to report a failure, so the result of the
        // close is deliberately ignored. Use `Fd::close` to observe the error.
        let _ = self.close_inplace();
    }
}
1038
impl<'a> FdRef<'a> {
    /// Returns the raw descriptor number.
    pub fn raw(&self) -> c_int {
        self.0
    }

    /// Wraps a raw descriptor number without checking that it is valid or that it stays
    /// open for `'a`.
    #[inline]
    pub const fn from_raw_unchecked(fd: c_int) -> Self {
        Self(fd, PhantomData)
    }
}
1049
1050impl<'a> From<&'a Fd> for FdRef<'a> {
1051    fn from(fd: &'a Fd) -> Self {
1052        FdRef(fd.0, PhantomData)
1053    }
1054}
1055
1056impl<'a> From<&'a mut Fd> for FdRef<'a> {
1057    fn from(fd: &'a mut Fd) -> Self {
1058        FdRef(fd.0, PhantomData)
1059    }
1060}
1061
1062impl core::fmt::Write for Fd {
1063    fn write_str(&mut self, string: &str) -> core::fmt::Result {
1064        FdRef::from(self).write_str(string)
1065    }
1066}
1067
1068impl<'a> core::fmt::Write for FdRef<'a> {
1069    fn write_str(&mut self, string: &str) -> core::fmt::Result {
1070        let mut bytes = string.as_bytes();
1071        while !bytes.is_empty() {
1072            let count = sys_write(*self, bytes).map_err(|_| core::fmt::Error)?;
1073            if count == 0 {
1074                return Err(core::fmt::Error);
1075            }
1076            bytes = bytes.get(count..).ok_or(core::fmt::Error)?;
1077        }
1078
1079        Ok(())
1080    }
1081}
1082
1083fn sys_getdents64(fd: FdRef, buffer: &mut [u8]) -> Result<Option<usize>, Error> {
1084    let length = buffer.len();
1085    let bytes_read = unsafe { syscall!(SYS_getdents64, fd.raw(), buffer, length) };
1086    Error::from_syscall("getdents64", bytes_read)?;
1087
1088    if bytes_read == 0 {
1089        Ok(None)
1090    } else {
1091        Ok(Some(bytes_read as usize))
1092    }
1093}
1094
1095pub unsafe fn sys_arch_prctl_set_gs(value: usize) -> Result<(), Error> {
1096    let result = syscall_readonly!(SYS_arch_prctl, ARCH_SET_GS, value);
1097    Error::from_syscall("arch_prctl(ARCH_SET_GS)", result)?;
1098    Ok(())
1099}
1100
1101pub fn sys_sched_yield() -> Result<(), Error> {
1102    // On Linux this always succeeds, although technically it could fail
1103    // due to a seccomp sandbox, so let's return an error anyway.
1104    let result = unsafe { syscall_readonly!(SYS_sched_yield) };
1105    Error::from_syscall("sched_yield", result)?;
1106    Ok(())
1107}
1108
1109pub fn sys_socketpair(domain: u32, kind: u32, protocol: u32) -> Result<(Fd, Fd), Error> {
1110    let mut output: [c_int; 2] = [-1, -1];
1111    let fd = unsafe { syscall_readonly!(SYS_socketpair, domain, kind, protocol, &mut output[..]) };
1112    Error::from_syscall("socketpair", fd)?;
1113    Ok((Fd(output[0] as c_int), Fd(output[1] as c_int)))
1114}
1115
1116pub fn sys_pipe2(flags: c_uint) -> Result<(Fd, Fd), Error> {
1117    let mut pipes: [c_int; 2] = [-1, -1];
1118    let result = unsafe { syscall_readonly!(SYS_pipe2, pipes.as_mut_ptr(), flags) };
1119    Error::from_syscall("pipe2", result)?;
1120    Ok((Fd::from_raw_unchecked(pipes[0]), Fd::from_raw_unchecked(pipes[1])))
1121}
1122
1123pub fn sys_open(path: &CStr, flags: c_uint) -> Result<Fd, Error> {
1124    let fd = unsafe { syscall_readonly!(SYS_open, path.as_ptr(), flags, 0) };
1125    Error::from_syscall("open", fd)?;
1126    Ok(Fd(fd as c_int))
1127}
1128
1129pub fn sys_openat(dir: FdRef, path: &CStr, flags: c_uint) -> Result<Fd, Error> {
1130    let fd = unsafe { syscall_readonly!(SYS_openat, dir, path.as_ptr(), flags, 0) };
1131    Error::from_syscall("openat", fd)?;
1132    Ok(Fd(fd as c_int))
1133}
1134
1135pub fn sys_memfd_create(name: &CStr, flags: c_uint) -> Result<Fd, Error> {
1136    let fd = unsafe { syscall_readonly!(SYS_memfd_create, name.as_ptr(), flags) };
1137    Error::from_syscall("memfd_create", fd)?;
1138    Ok(Fd(fd as c_int))
1139}
1140
1141pub fn sys_fcntl(fd: FdRef, cmd: u32, arg: u32) -> Result<(), Error> {
1142    let result = unsafe { syscall_readonly!(SYS_fcntl, fd, cmd, arg) };
1143    Error::from_syscall("fcntl", result)?;
1144    Ok(())
1145}
1146
1147pub fn sys_close_range(first_fd: c_int, last_fd: c_int, flags: c_uint) -> Result<(), Error> {
1148    let result = unsafe { syscall_readonly!(SYS_close_range, first_fd, last_fd, flags) };
1149    Error::from_syscall("close_range", result)
1150}
1151
1152pub fn sys_ftruncate(fd: FdRef, length: c_ulong) -> Result<(), Error> {
1153    let result = unsafe { syscall!(SYS_ftruncate, fd, length) };
1154    Error::from_syscall("ftruncate", result)
1155}
1156
1157pub fn sys_chdir(path: &CStr) -> Result<(), Error> {
1158    let result = unsafe { syscall_readonly!(SYS_chdir, path.as_ptr()) };
1159    Error::from_syscall("chdir", result)
1160}
1161
1162pub fn sys_fchdir(fd: FdRef) -> Result<(), Error> {
1163    let result = unsafe { syscall_readonly!(SYS_fchdir, fd) };
1164    Error::from_syscall("fchdir", result)
1165}
1166
1167pub unsafe fn sys_mmap(
1168    address: *mut c_void,
1169    length: c_size_t,
1170    protection: c_uint,
1171    flags: c_uint,
1172    fd: Option<FdRef>,
1173    offset: c_ulong,
1174) -> Result<*mut c_void, Error> {
1175    let result = syscall!(SYS_mmap, address, length, protection, flags, fd, offset);
1176    Error::from_syscall("mmap", result)?;
1177    Ok(result as *mut c_void)
1178}
1179
1180pub unsafe fn sys_munmap(address: *mut c_void, length: c_size_t) -> Result<(), Error> {
1181    let result = syscall!(SYS_munmap, address, length);
1182    Error::from_syscall("munmap", result)
1183}
1184
1185pub unsafe fn sys_mremap(
1186    address: *mut c_void,
1187    old_length: c_size_t,
1188    new_length: c_size_t,
1189    flags: c_uint,
1190    new_address: *mut c_void,
1191) -> Result<*mut c_void, Error> {
1192    let result = syscall!(SYS_mremap, address, old_length, new_length, flags, new_address);
1193    Error::from_syscall("mremap", result)?;
1194    Ok(result as *mut c_void)
1195}
1196
1197pub unsafe fn sys_mprotect(address: *mut c_void, length: c_size_t, protection: c_uint) -> Result<(), Error> {
1198    let result = syscall!(SYS_mprotect, address, length, protection);
1199    Error::from_syscall("mprotect", result)
1200}
1201
1202pub unsafe fn sys_madvise(address: *mut c_void, length: c_size_t, advice: c_uint) -> Result<(), Error> {
1203    let result = syscall!(SYS_madvise, address, length, advice);
1204    Error::from_syscall("madvise", result)
1205}
1206
1207pub fn sys_getpid() -> Result<pid_t, Error> {
1208    let result = unsafe { syscall_readonly!(SYS_getpid) };
1209    Error::from_syscall("getpid", result)?;
1210    Ok(result as pid_t)
1211}
1212
1213pub fn sys_getuid() -> Result<uid_t, Error> {
1214    let result = unsafe { syscall_readonly!(SYS_getuid) };
1215    Error::from_syscall("getuid", result)?;
1216    Ok(result as u32)
1217}
1218
1219pub fn sys_getgid() -> Result<gid_t, Error> {
1220    let result = unsafe { syscall_readonly!(SYS_getgid) };
1221    Error::from_syscall("getgid", result)?;
1222    Ok(result as u32)
1223}
1224
1225pub fn sys_kill(pid: pid_t, signal: c_uint) -> Result<(), Error> {
1226    let result = unsafe { syscall_readonly!(SYS_kill, pid, signal) };
1227    Error::from_syscall("kill", result)?;
1228    Ok(())
1229}
1230
1231pub unsafe fn sys_read_raw(fd: FdRef, buffer: *mut u8, length: usize) -> Result<c_size_t, Error> {
1232    let result = unsafe { syscall!(SYS_read, fd.raw(), buffer, length) };
1233    Error::from_syscall("read", result)?;
1234    Ok(result as c_size_t)
1235}
1236
1237pub fn sys_read(fd: FdRef, buffer: &mut [u8]) -> Result<c_size_t, Error> {
1238    unsafe { sys_read_raw(fd, buffer.as_mut_ptr(), buffer.len()) }
1239}
1240
1241pub fn sys_write(fd: FdRef, buffer: &[u8]) -> Result<c_size_t, Error> {
1242    let result = unsafe { syscall_readonly!(SYS_write, fd.raw(), buffer.as_ptr(), buffer.len()) };
1243    Error::from_syscall("write", result)?;
1244    Ok(result as c_size_t)
1245}
1246
1247pub fn sys_lseek(fd: FdRef, offset: i64, whence: u32) -> Result<u64, Error> {
1248    let result = unsafe { syscall_readonly!(SYS_lseek, fd.raw(), offset, whence) };
1249    Error::from_syscall("lseek", result)?;
1250    Ok(result as u64)
1251}
1252
1253pub unsafe fn sys_process_vm_readv(pid: pid_t, local_iovec: &[iovec], remote_iovec: &[iovec]) -> Result<usize, Error> {
1254    let result = unsafe {
1255        syscall!(
1256            SYS_process_vm_readv,
1257            pid,
1258            local_iovec,
1259            local_iovec.len(),
1260            remote_iovec,
1261            remote_iovec.len(),
1262            0
1263        )
1264    };
1265    Error::from_syscall("process_vm_readv", result)?;
1266    Ok(result as usize)
1267}
1268
1269pub unsafe fn sys_process_vm_writev(pid: pid_t, local_iovec: &[iovec], remote_iovec: &[iovec]) -> Result<usize, Error> {
1270    let result = unsafe {
1271        syscall!(
1272            SYS_process_vm_writev,
1273            pid,
1274            local_iovec,
1275            local_iovec.len(),
1276            remote_iovec,
1277            remote_iovec.len(),
1278            0
1279        )
1280    };
1281    Error::from_syscall("process_vm_writev", result)?;
1282    Ok(result as usize)
1283}
1284
1285pub unsafe fn sys_writev(fd: FdRef, iv: &[iovec]) -> Result<usize, Error> {
1286    let result = unsafe { syscall!(SYS_writev, fd, iv, iv.len()) };
1287    Error::from_syscall("writev", result)?;
1288    Ok(result as usize)
1289}
1290
1291pub fn sys_sendmsg(fd: FdRef, message: &msghdr, flags: u32) -> Result<usize, Error> {
1292    let result = unsafe { syscall_readonly!(SYS_sendmsg, fd.raw(), message as *const msghdr, flags) };
1293    Error::from_syscall("sendmsg", result)?;
1294    Ok(result as usize)
1295}
1296
1297pub fn sys_recvmsg(fd: FdRef, message: &mut msghdr, flags: u32) -> Result<usize, Error> {
1298    let result = unsafe { syscall!(SYS_recvmsg, fd.raw(), message as *mut msghdr, flags) };
1299    Error::from_syscall("recvmsg", result)?;
1300    Ok(result as usize)
1301}
1302
1303pub fn sys_exit(errcode: c_int) -> Result<(), Error> {
1304    let result = unsafe { syscall_readonly!(SYS_exit, errcode) };
1305    Error::from_syscall("exit", result)?;
1306    Ok(())
1307}
1308
1309pub fn sys_dup3(old_fd: c_int, new_fd: c_int, flags: c_uint) -> Result<(), Error> {
1310    let result = unsafe { syscall_readonly!(SYS_dup3, old_fd, new_fd, flags) };
1311    Error::from_syscall("dup3", result)?;
1312    Ok(())
1313}
1314
1315pub unsafe fn sys_execveat(
1316    dirfd: Option<FdRef>,
1317    path: &CStr,
1318    argv: &[*const c_uchar],
1319    envp: &[*const c_uchar],
1320    flags: c_uint,
1321) -> Result<(), Error> {
1322    let result = unsafe { syscall_readonly!(SYS_execveat, dirfd, path.as_ptr(), argv, envp, flags) };
1323    Error::from_syscall("execveat", result)?;
1324    Ok(())
1325}
1326
1327pub fn sys_ptrace_traceme() -> Result<(), Error> {
1328    let result = unsafe { syscall_readonly!(SYS_ptrace, 0, 0, 0) };
1329    Error::from_syscall("ptrace (PTRACE_TRACEME)", result)?;
1330    Ok(())
1331}
1332
1333pub fn sys_prctl_set_no_new_privs() -> Result<(), Error> {
1334    const PR_SET_NO_NEW_PRIVS: usize = 38;
1335    let result = unsafe { syscall_readonly!(SYS_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
1336    Error::from_syscall("prctl(PR_SET_NO_NEW_PRIVS)", result)
1337}
1338
1339pub fn sys_prctl_cap_ambient_clear_all() -> Result<(), Error> {
1340    const PR_CAP_AMBIENT: usize = 47;
1341    const PR_CAP_AMBIENT_CLEAR_ALL: usize = 4;
1342    let result = unsafe { syscall_readonly!(SYS_prctl, PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) };
1343    Error::from_syscall("prctl(PR_CAP_AMBIENT)", result)
1344}
1345
1346pub fn sys_prctl_set_securebits(bits: u32) -> Result<(), Error> {
1347    const PR_SET_SECUREBITS: usize = 28;
1348    let result = unsafe { syscall_readonly!(SYS_prctl, PR_SET_SECUREBITS, bits, 0, 0, 0) };
1349    Error::from_syscall("prctl(PR_SET_SECUREBITS)", result)
1350}
1351
1352pub fn sys_prctl_set_name(name: &[u8; 16]) -> Result<(), Error> {
1353    const PR_SET_NAME: usize = 15;
1354    let result = unsafe { syscall_readonly!(SYS_prctl, PR_SET_NAME, name.as_ptr(), 0, 0, 0) };
1355    Error::from_syscall("prctl(PR_SET_NAME)", result)
1356}
1357
1358pub fn sys_capset(header: &__user_cap_header_struct, data: &[__user_cap_data_struct; 2]) -> Result<(), Error> {
1359    let result = unsafe {
1360        syscall_readonly!(
1361            SYS_capset,
1362            header as *const __user_cap_header_struct,
1363            data as *const __user_cap_data_struct
1364        )
1365    };
1366    Error::from_syscall("capset", result)
1367}
1368
1369pub fn sys_capset_drop_all() -> Result<(), Error> {
1370    let cap_user_header = __user_cap_header_struct {
1371        version: _LINUX_CAPABILITY_VERSION_3,
1372        pid: 0,
1373    };
1374    let cap_user_data = [__user_cap_data_struct {
1375        effective: 0,
1376        inheritable: 0,
1377        permitted: 0,
1378    }; 2];
1379
1380    sys_capset(&cap_user_header, &cap_user_data)
1381}
1382
1383pub fn sys_seccomp_set_mode_filter(filter: &[sock_filter]) -> Result<(), Error> {
1384    let filter = sock_fprog {
1385        length: if let Ok(length) = c_ushort::try_from(filter.len()) {
1386            length
1387        } else {
1388            return Err(Error::from_errno("seccomp(SECCOMP_SET_MODE_FILTER)", EINVAL as i32));
1389        },
1390        filter: filter.as_ptr(),
1391    };
1392
1393    let result = unsafe { syscall_readonly!(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, core::ptr::addr_of!(filter)) };
1394    Error::from_syscall("seccomp(SECCOMP_SET_MODE_FILTER)", result)
1395}
1396
1397pub fn sys_setrlimit(resource: u32, limit: &rlimit) -> Result<(), Error> {
1398    let result = unsafe { syscall_readonly!(SYS_setrlimit, resource, limit as *const rlimit) };
1399    Error::from_syscall("setrlimit", result)
1400}
1401
1402pub fn sys_sethostname(name: &str) -> Result<(), Error> {
1403    let result = unsafe { syscall_readonly!(SYS_sethostname, name.as_ptr(), name.len()) };
1404    Error::from_syscall("sethostname", result)
1405}
1406
1407pub fn sys_setdomainname(name: &str) -> Result<(), Error> {
1408    let result = unsafe { syscall_readonly!(SYS_setdomainname, name.as_ptr(), name.len()) };
1409    Error::from_syscall("setdomainname", result)
1410}
1411
1412pub fn sys_mount(dev_name: &CStr, dir_name: &CStr, kind: &CStr, flags: u32, data: Option<&CStr>) -> Result<(), Error> {
1413    let result = unsafe {
1414        syscall_readonly!(
1415            SYS_mount,
1416            dev_name.as_ptr(),
1417            dir_name.as_ptr(),
1418            kind.as_ptr(),
1419            flags,
1420            data.map_or(core::ptr::null(), |data| data.as_ptr())
1421        )
1422    };
1423    Error::from_syscall("mount", result)
1424}
1425
1426pub fn sys_umount2(target: &CStr, flags: u32) -> Result<(), Error> {
1427    let result = unsafe { syscall_readonly!(SYS_umount2, target.as_ptr(), flags) };
1428    Error::from_syscall("umount2", result)
1429}
1430
1431pub fn sys_pivot_root(new_root: &CStr, old_root: &CStr) -> Result<(), Error> {
1432    let result = unsafe { syscall_readonly!(SYS_pivot_root, new_root.as_ptr(), old_root.as_ptr()) };
1433    Error::from_syscall("pivot_root", result)
1434}
1435
1436pub fn sys_unshare(flags: u32) -> Result<(), Error> {
1437    let result = unsafe { syscall_readonly!(SYS_unshare, flags) };
1438    Error::from_syscall("unshare", result)
1439}
1440
1441/// Calls the `futex` syscall with `FUTEX_WAIT` operation.
1442///
1443/// This will block *if* the value of the `futex` is equal to the `expected_value`.
1444///
1445/// Possible non-fatal errors:
1446///   - `EAGAIN`: the value of `futex` is not equal to `expected_value`
1447///   - `EINTR`: the syscall was interrupted by a signal
1448///   - `ETIMEDOUT`: the specified timeout has elapsed without the futex being woken up
1449pub fn sys_futex_wait(futex: &AtomicU32, expected_value: u32, timeout: Option<Duration>) -> Result<(), Error> {
1450    let ts: Option<timespec> = timeout.map(|timeout| timespec {
1451        tv_sec: timeout.as_secs() as i64,
1452        tv_nsec: u64::from(timeout.subsec_nanos()) as i64,
1453    });
1454
1455    let result = unsafe {
1456        syscall!(
1457            SYS_futex,
1458            futex as *const AtomicU32,
1459            FUTEX_WAIT,
1460            expected_value,
1461            ts.as_ref().map_or(core::ptr::null(), |ts| ts as *const timespec)
1462        )
1463    };
1464    Error::from_syscall("futex (wait)", result)
1465}
1466
1467/// Wakes up at most one thread waiting on `futex`.
1468///
1469/// Will return `true` if anybody was woken up.
1470pub fn sys_futex_wake_one(futex: &AtomicU32) -> Result<bool, Error> {
1471    let result = unsafe { syscall_readonly!(SYS_futex, futex as *const AtomicU32, FUTEX_WAKE, 1) };
1472    Error::from_syscall("futex (wake)", result)?;
1473    Ok(result == 1)
1474}
1475
1476pub fn sys_set_tid_address(address: *const u32) -> Result<(), Error> {
1477    let result = unsafe { syscall_readonly!(SYS_set_tid_address, address) };
1478    Error::from_syscall("set_tid_address", result)?;
1479    Ok(())
1480}
1481
1482pub unsafe fn sys_rt_sigaction(signal: u32, new_action: &kernel_sigaction, old_action: Option<&mut kernel_sigaction>) -> Result<(), Error> {
1483    let result = unsafe {
1484        syscall_readonly!(
1485            SYS_rt_sigaction,
1486            signal,
1487            new_action as *const kernel_sigaction,
1488            old_action.map_or(core::ptr::null_mut(), |old_action| old_action as *mut kernel_sigaction),
1489            core::mem::size_of::<kernel_sigset_t>()
1490        )
1491    };
1492    Error::from_syscall("rt_sigaction", result)?;
1493    Ok(())
1494}
1495
1496pub unsafe fn sys_rt_sigprocmask(how: u32, new_sigset: &kernel_sigset_t, old_sigset: Option<&mut kernel_sigset_t>) -> Result<(), Error> {
1497    let result = unsafe {
1498        syscall_readonly!(
1499            SYS_rt_sigprocmask,
1500            how,
1501            new_sigset as *const kernel_sigset_t,
1502            old_sigset.map_or(core::ptr::null_mut(), |old_sigset| old_sigset as *mut kernel_sigset_t),
1503            core::mem::size_of::<kernel_sigset_t>()
1504        )
1505    };
1506    Error::from_syscall("rt_sigprocmask", result)?;
1507    Ok(())
1508}
1509
1510pub unsafe fn sys_sigaltstack(new_stack: &stack_t, old_stack: Option<&mut stack_t>) -> Result<(), Error> {
1511    let result = unsafe {
1512        syscall_readonly!(
1513            SYS_sigaltstack,
1514            new_stack as *const stack_t,
1515            old_stack.map_or(core::ptr::null_mut(), |old_stack| old_stack as *mut stack_t)
1516        )
1517    };
1518    Error::from_syscall("sigaltstack", result)?;
1519    Ok(())
1520}
1521
1522pub fn sys_clock_gettime(clock_id: u32) -> Result<Duration, Error> {
1523    let mut output = timespec { tv_sec: 0, tv_nsec: 0 };
1524    let result = unsafe { syscall_readonly!(SYS_clock_gettime, clock_id, core::ptr::addr_of_mut!(output)) };
1525    Error::from_syscall("clock_gettime", result)?;
1526
1527    let duration = Duration::new(output.tv_sec as u64, output.tv_nsec as u32);
1528    Ok(duration)
1529}
1530
1531pub fn sys_waitid(which: u32, pid: pid_t, info: &mut siginfo_t, options: u32, usage: Option<&mut rusage>) -> Result<(), Error> {
1532    let result = unsafe {
1533        syscall_readonly!(
1534            SYS_waitid,
1535            which,
1536            pid,
1537            info as *mut siginfo_t,
1538            options,
1539            usage.map_or(core::ptr::null_mut(), |usage| usage as *mut rusage)
1540        )
1541    };
1542
1543    Error::from_syscall("waitid", result)?;
1544    Ok(())
1545}
1546
1547pub fn vm_read_memory<const N_LOCAL: usize, const N_REMOTE: usize>(
1548    pid: pid_t,
1549    local: [&mut [MaybeUninit<u8>]; N_LOCAL],
1550    remote: [(usize, usize); N_REMOTE],
1551) -> Result<usize, Error> {
1552    let local_iovec = local.map(|slice| iovec {
1553        iov_base: slice.as_mut_ptr().cast(),
1554        iov_len: slice.len() as u64,
1555    });
1556    let remote_iovec = remote.map(|(address, length)| iovec {
1557        iov_base: address as *mut c_void,
1558        iov_len: length as u64,
1559    });
1560    unsafe { sys_process_vm_readv(pid, &local_iovec, &remote_iovec) }
1561}
1562
1563pub fn vm_write_memory<const N_LOCAL: usize, const N_REMOTE: usize>(
1564    pid: pid_t,
1565    local: [&[u8]; N_LOCAL],
1566    remote: [(usize, usize); N_REMOTE],
1567) -> Result<usize, Error> {
1568    let local_iovec = local.map(|slice| iovec {
1569        iov_base: slice.as_ptr().cast_mut().cast(),
1570        iov_len: slice.len() as u64,
1571    });
1572    let remote_iovec = remote.map(|(address, length)| iovec {
1573        iov_base: address as *mut c_void,
1574        iov_len: length as u64,
1575    });
1576    unsafe { sys_process_vm_writev(pid, &local_iovec, &remote_iovec) }
1577}
1578
1579pub fn writev<const N: usize>(fd: FdRef, list: [&[u8]; N]) -> Result<usize, Error> {
1580    let iv = list.map(|slice| iovec {
1581        iov_base: slice.as_ptr().cast_mut().cast(),
1582        iov_len: slice.len() as u64,
1583    });
1584    unsafe { sys_writev(fd, &iv) }
1585}
1586
1587#[inline(always)] // To prevent the buffer from being copied.
1588pub fn readdir(dirfd: FdRef) -> Dirent64Iter {
1589    Dirent64Iter {
1590        dirfd,
1591        buffer: [0; 1024], // TODO: Use MaybeUninit.
1592        bytes_available: 0,
1593        position: 0,
1594    }
1595}
1596
/// A single directory entry, as produced by [`Dirent64Iter`].
///
/// Wraps a raw `linux_dirent64` value; the `'a` lifetime is carried only
/// through a `PhantomData` marker.
#[repr(transparent)]
pub struct Dirent64<'a> {
    /// The raw entry header.
    raw: linux_dirent64,
    /// Marker tying the entry to the `'a` lifetime; holds no data.
    _lifetime: core::marker::PhantomData<&'a [u8]>,
}
1602
impl<'a> Dirent64<'a> {
    /// Returns the raw `d_type` byte of this entry.
    pub fn d_type(&self) -> c_uchar {
        self.raw.d_type
    }

    /// Returns the name of this entry as bytes, without the terminating NUL.
    ///
    /// NOTE(review): the returned slice points into `self.raw`, yet it carries
    /// the lifetime `'a` rather than the lifetime of the `&self` borrow; confirm
    /// that callers never keep the name around for longer than the entry itself.
    ///
    /// NOTE(review): the scan below reads bytes starting at `d_name` until it
    /// finds a zero; this presumably relies on the whole name being stored
    /// right after the header inside of this value. Verify against the
    /// definition of `linux_dirent64`.
    pub fn d_name(&self) -> &'a [u8] {
        unsafe {
            let name = self.raw.d_name.as_ptr();
            // Walk forward from the start of the name until the first zero byte.
            let length = {
                let mut p = self.raw.d_name.as_ptr();
                while *p != 0 {
                    p = p.add(1);
                }

                // The distance between the NUL and the start is the length of the name.
                p as usize - name as usize
            };

            core::slice::from_raw_parts(name.cast(), length)
        }
    }
}
1624
/// Iterator over directory entries, created by [`readdir`].
pub struct Dirent64Iter<'a> {
    /// The directory being read.
    dirfd: FdRef<'a>,
    /// Scratch space which `sys_getdents64` fills with raw entries.
    buffer: [u8; 1024],
    /// Number of bytes which the last `sys_getdents64` call reported for `buffer`.
    bytes_available: usize,
    /// Offset inside of `buffer` of the next entry to return.
    position: usize,
}
1631
1632impl<'a> Iterator for Dirent64Iter<'a> {
1633    type Item = Result<Dirent64<'a>, Error>;
1634    fn next(&mut self) -> Option<Self::Item> {
1635        loop {
1636            if self.position < self.bytes_available {
1637                let dirent = unsafe { core::ptr::read_unaligned(self.buffer.as_ptr().add(self.position).cast::<Dirent64>()) };
1638
1639                self.position += usize::from(dirent.raw.d_reclen);
1640                return Some(Ok(dirent));
1641            }
1642
1643            match sys_getdents64(self.dirfd, &mut self.buffer) {
1644                Ok(Some(bytes_available)) => self.bytes_available = bytes_available,
1645                Ok(None) => return None,
1646                Err(error) => return Some(Err(error)),
1647            };
1648        }
1649    }
1650}
1651
1652pub fn sendfd(socket: FdRef, fd: FdRef) -> Result<(), Error> {
1653    let mut dummy: c_int = 0;
1654    let mut buffer = [0; CMSG_SPACE(core::mem::size_of::<c_int>())];
1655
1656    let mut iov = iovec {
1657        iov_base: core::ptr::addr_of_mut!(dummy).cast::<c_void>(),
1658        iov_len: core::mem::size_of_val(&dummy) as u64,
1659    };
1660
1661    let mut header = msghdr {
1662        msg_name: core::ptr::null_mut(),
1663        msg_namelen: 0,
1664        msg_iov: &mut iov,
1665        msg_iovlen: 1,
1666        msg_control: buffer.as_mut_ptr().cast::<c_void>(),
1667        msg_controllen: core::mem::size_of_val(&buffer),
1668        msg_flags: 0,
1669    };
1670
1671    let control_header = cmsghdr {
1672        cmsg_len: CMSG_LEN(core::mem::size_of::<c_int>()),
1673        cmsg_level: SOL_SOCKET,
1674        cmsg_type: SCM_RIGHTS,
1675    };
1676
1677    #[allow(clippy::cast_ptr_alignment)]
1678    unsafe {
1679        core::ptr::write_unaligned(CMSG_FIRSTHDR(&header), control_header);
1680        core::ptr::write_unaligned(CMSG_DATA(buffer.as_mut_ptr().cast::<cmsghdr>()).cast::<c_int>(), fd.raw());
1681    }
1682
1683    header.msg_controllen = CMSG_LEN(core::mem::size_of::<c_int>());
1684    sys_sendmsg(socket, &header, MSG_NOSIGNAL)?;
1685
1686    Ok(())
1687}
1688
1689pub fn recvfd(socket: FdRef) -> Result<Fd, Error> {
1690    let mut dummy: c_int = 0;
1691    let mut buffer = [0; CMSG_SPACE(core::mem::size_of::<c_int>())];
1692
1693    let mut iov = iovec {
1694        iov_base: core::ptr::addr_of_mut!(dummy).cast::<c_void>(),
1695        iov_len: core::mem::size_of_val(&dummy) as u64,
1696    };
1697
1698    let mut header = msghdr {
1699        msg_name: core::ptr::null_mut(),
1700        msg_namelen: 0,
1701        msg_iov: &mut iov,
1702        msg_iovlen: 1,
1703        msg_control: buffer.as_mut_ptr().cast::<c_void>(),
1704        msg_controllen: core::mem::size_of_val(&buffer),
1705        msg_flags: 0,
1706    };
1707
1708    let count = sys_recvmsg(socket, &mut header, 0)?;
1709    if count != core::mem::size_of::<c_int>() {
1710        return Err(Error::from_str("recvfd failed: received unexpected number of bytes"));
1711    }
1712
1713    if header.msg_controllen != CMSG_SPACE(core::mem::size_of::<c_int>()) {
1714        return Err(Error::from_str("recvfd failed: invalid control message size"));
1715    }
1716
1717    let control_header = unsafe { &mut *header.msg_control.cast::<cmsghdr>() };
1718
1719    if control_header.cmsg_level != SOL_SOCKET {
1720        return Err(Error::from_str("recvfd failed: invalid control message level"));
1721    }
1722
1723    if control_header.cmsg_type != SCM_RIGHTS {
1724        return Err(Error::from_str("recvfd failed: invalid control message type"));
1725    }
1726
1727    let fd = unsafe { core::ptr::read_unaligned(CMSG_DATA(control_header).cast::<c_int>()) };
1728
1729    Ok(Fd::from_raw_unchecked(fd))
1730}