polkavm/sandbox/linux.rs

1#![allow(clippy::undocumented_unsafe_blocks)]
2#![allow(clippy::manual_range_contains)]
3
4extern crate polkavm_linux_raw as linux_raw;
5
6use polkavm_common::{
7    abi::VM_MAX_PAGE_SIZE,
8    error::{ExecutionError, Trap},
9    program::Reg,
10    utils::{align_to_next_page_usize, slice_assume_init_mut, Access, AsUninitSliceMut, Gas},
11    zygote::{
12        AddressTable, AddressTablePacked,
13        SandboxMemoryConfig, VmCtx, SANDBOX_EMPTY_NATIVE_PROGRAM_COUNTER, SANDBOX_EMPTY_NTH_INSTRUCTION, VMCTX_FUTEX_BUSY,
14        VMCTX_FUTEX_HOSTCALL, VMCTX_FUTEX_IDLE, VMCTX_FUTEX_INIT, VMCTX_FUTEX_TRAP, VM_ADDR_NATIVE_CODE,
15    },
16};
17
18use super::ExecuteArgs;
19
20pub use linux_raw::Error;
21
22use core::ffi::{c_int, c_uint};
23use core::ops::Range;
24use core::sync::atomic::Ordering;
25use core::time::Duration;
26use linux_raw::{abort, cstr, syscall_readonly, Fd, Mmap, STDERR_FILENO, STDIN_FILENO};
27use std::borrow::Cow;
28use std::time::Instant;
29use std::sync::{Arc, Mutex};
30
31use super::{SandboxKind, SandboxInit, SandboxVec, get_native_page_size};
32use crate::api::{BackendAccess, CompiledModuleKind, MemoryAccessError, Module, HostcallHandler};
33use crate::compiler::CompiledModule;
34use crate::config::GasMeteringKind;
35
36pub struct SandboxConfig {
37    enable_logger: bool,
38}
39
40impl SandboxConfig {
41    pub fn new() -> Self {
42        SandboxConfig { enable_logger: false }
43    }
44}
45
46impl super::SandboxConfig for SandboxConfig {
47    fn enable_logger(&mut self, value: bool) {
48        self.enable_logger = value;
49    }
50}
51
52impl Default for SandboxConfig {
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
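/// Argument block for the `clone3` syscall; the field layout mirrors the kernel's `struct clone_args`.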
58#[repr(C)]
59struct CloneArgs {
60    /// Flags.
61    flags: u64,
62    /// Where to store PID file descriptor. (int *)
63    pidfd: *mut c_int,
64    /// Where to store child TID in child's memory. (pid_t *)
65    child_tid: u64,
66    /// Where to store child TID in parent's memory. (pid_t *)
67    parent_tid: u64,
68    /// Signal to deliver to parent on child termination.
69    exit_signal: u64,
70    /// Pointer to lowest byte of stack.
71    stack: u64,
72    /// Size of the stack.
73    stack_size: u64,
74    /// Location of the new TLS.
75    tls: u64,
76}
77
78/// Closes all file descriptors except the ones given.
79fn close_other_file_descriptors(preserved_fds: &[c_int]) -> Result<(), Error> {
80    let mut start_at = 0;
81    for &fd in preserved_fds {
82        if start_at == fd {
83            start_at = fd + 1;
84            continue;
85        }
86
87        if start_at > fd {
88            // Preserved file descriptors must be sorted.
89            return Err(Error::from_str("internal error: preserved file descriptors are not sorted"));
90        }
91
92        if linux_raw::sys_close_range(start_at, fd - 1, 0).is_err() {
93            return close_other_file_descriptors_legacy(preserved_fds);
94        }
95
96        start_at = fd + 1;
97    }
98
99    if linux_raw::sys_close_range(start_at, c_int::MAX, 0).is_err() {
100        return close_other_file_descriptors_legacy(preserved_fds);
101    }
102
103    Ok(())
104}
105
106/// Closes all file descriptors except the ones given.
107///
108/// For compatibility with old versions of Linux.
109fn close_other_file_descriptors_legacy(preserved_fds: &[c_int]) -> Result<(), Error> {
110    let dirfd = linux_raw::sys_open(
111        cstr!("/proc/self/fd"),
112        linux_raw::O_RDONLY | linux_raw::O_DIRECTORY | linux_raw::O_CLOEXEC,
113    )?;
114    for dirent in linux_raw::readdir(dirfd.borrow()) {
115        let dirent = dirent?;
116        let name = dirent.d_name();
117        if !name.iter().all(|&byte| byte >= b'0' && byte <= b'9') {
118            continue;
119        }
120
121        let name = core::str::from_utf8(name)
122            .ok()
123            .ok_or_else(|| Error::from_str("entry in '/proc/self/fd' is not valid utf-8"))?;
124        let fd: c_int = name
125            .parse()
126            .ok()
127            .ok_or_else(|| Error::from_str("entry in '/proc/self/fd' is not a number"))?;
128        if fd == dirfd.raw() || preserved_fds.iter().any(|&pfd| pfd == fd) {
129            continue;
130        }
131
132        Fd::from_raw_unchecked(fd).close()?;
133    }
134
135    dirfd.close()?;
136    Ok(())
137}
138
139struct Sigmask {
140    sigset_original: linux_raw::kernel_sigset_t,
141}
142
143impl Sigmask {
144    /// Temporarily blocks all signals from being delivered.
145    fn block_all_signals() -> Result<Self, Error> {
146        let sigset_all: linux_raw::kernel_sigset_t = !0;
147        let mut sigset_original: linux_raw::kernel_sigset_t = 0;
148        unsafe { linux_raw::sys_rt_sigprocmask(linux_raw::SIG_SETMASK, &sigset_all, Some(&mut sigset_original))? };
149
150        Ok(Sigmask { sigset_original })
151    }
152
153    /// Unblocks signal delivery.
154    fn unblock(mut self) -> Result<(), Error> {
155        let result = self.unblock_inplace();
156        core::mem::forget(self);
157        result
158    }
159
160    /// Unblocks signal delivery.
161    fn unblock_inplace(&mut self) -> Result<(), Error> {
162        unsafe { linux_raw::sys_rt_sigprocmask(linux_raw::SIG_SETMASK, &self.sigset_original, None) }
163    }
164}
165
166impl Drop for Sigmask {
167    fn drop(&mut self) {
168        let _ = self.unblock_inplace();
169    }
170}
171
172#[derive(Debug)]
173struct ChildProcess {
174    pid: c_int,
175    pidfd: Option<Fd>,
176}
177
178#[derive(Debug)]
179enum ChildStatus {
180    Running,
181    NotRunning,
182    Exited(c_int),
183    ExitedDueToSignal(c_int),
184}
185
186impl ChildStatus {
187    pub fn is_running(&self) -> bool {
188        matches!(self, Self::Running)
189    }
190}
191
192struct Signal(c_int);
193impl core::fmt::Display for Signal {
194    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
195        let name = match self.0 as u32 {
196            linux_raw::SIGABRT => "SIGABRT",
197            linux_raw::SIGBUS => "SIGBUS",
198            linux_raw::SIGCHLD => "SIGCHLD",
199            linux_raw::SIGCONT => "SIGCONT",
200            linux_raw::SIGFPE => "SIGFPE",
201            linux_raw::SIGHUP => "SIGHUP",
202            linux_raw::SIGILL => "SIGILL",
203            linux_raw::SIGINT => "SIGINT",
204            linux_raw::SIGKILL => "SIGKILL",
205            linux_raw::SIGPIPE => "SIGPIPE",
206            linux_raw::SIGSEGV => "SIGSEGV",
207            linux_raw::SIGSTOP => "SIGSTOP",
208            linux_raw::SIGSYS => "SIGSYS",
209            linux_raw::SIGTERM => "SIGTERM",
210            linux_raw::SIGTRAP => "SIGTRAP",
211            _ => return write!(fmt, "{}", self.0)
212        };
213
214        fmt.write_str(name)
215    }
216}
217
218impl core::fmt::Display for ChildStatus {
219    fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
220        match self {
221            ChildStatus::Running => fmt.write_str("running"),
222            ChildStatus::NotRunning => fmt.write_str("not running"),
223            ChildStatus::Exited(code) => write!(fmt, "exited (status = {code})"),
224            ChildStatus::ExitedDueToSignal(signum) => write!(fmt, "exited due to signal (signal = {})", Signal(*signum)),
225        }
226    }
227}
228
229impl ChildProcess {
230    fn waitid(&mut self, flags: u32) -> Result<linux_raw::siginfo_t, Error> {
231        let mut siginfo: linux_raw::siginfo_t = unsafe { core::mem::zeroed() };
232        let mut result;
233        loop {
234            result = if let Some(ref pidfd) = self.pidfd {
235                linux_raw::sys_waitid(linux_raw::P_PIDFD, pidfd.raw(), &mut siginfo, flags, None)
236            } else {
237                linux_raw::sys_waitid(linux_raw::P_PID, self.pid, &mut siginfo, flags, None)
238            };
239
240            if let Err(error) = result {
241                if error.errno() == linux_raw::EINTR {
242                    // This should not happen since all signals should be blocked while this is called, but retry just in case.
243                    continue;
244                }
245
246                return Err(error);
247            }
248
249            return Ok(siginfo);
250        }
251    }
252
253    fn check_status(&mut self, non_blocking: bool) -> Result<ChildStatus, Error> {
254        // The __WALL here is needed since we're not specifying an exit signal
255        // when cloning the child process, so we'd get an ECHILD error without this flag.
256        //
257        // (And we're not using __WCLONE since that doesn't work for children which ran execve.)
258        let mut flags = linux_raw::WEXITED | linux_raw::__WALL;
259        if non_blocking {
260            flags |= linux_raw::WNOHANG;
261        }
262
263        match self.waitid(flags) {
264            Ok(ok) => unsafe {
265                if ok.si_signo() == 0 && ok.si_pid() == 0 {
266                    Ok(ChildStatus::Running)
267                } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_EXITED {
268                    Ok(ChildStatus::Exited(ok.si_status()))
269                } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && (ok.si_code() as u32 == linux_raw::CLD_KILLED || ok.si_code() as u32 == linux_raw::CLD_DUMPED) {
270                    Ok(ChildStatus::ExitedDueToSignal(linux_raw::WTERMSIG(ok.si_status())))
271                } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_STOPPED {
272                    Err(Error::from_last_os_error("waitid failed: unexpected CLD_STOPPED status"))
273                } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_TRAPPED {
274                    Err(Error::from_last_os_error("waitid failed: unexpected CLD_TRAPPED status"))
275                } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_CONTINUED {
276                    Err(Error::from_last_os_error("waitid failed: unexpected CLD_CONTINUED status"))
277                } else if ok.si_signo() != 0 {
278                    Ok(ChildStatus::ExitedDueToSignal(ok.si_signo()))
279                } else {
280                    Err(Error::from_last_os_error("waitid failed: internal error: unexpected state"))
281                }
282            },
283            Err(error) => {
284                if error.errno() == linux_raw::ECHILD {
285                    Ok(ChildStatus::NotRunning)
286                } else {
287                    Err(error)
288                }
289            }
290        }
291    }
292
293    fn send_signal(&mut self, signal: c_uint) -> Result<(), Error> {
294        unsafe {
295            if let Some(ref pidfd) = self.pidfd {
296                let errcode = syscall_readonly!(linux_raw::SYS_pidfd_send_signal, pidfd, signal, 0, 0);
297                Error::from_syscall("pidfd_send_signal", errcode)
298            } else {
299                linux_raw::sys_kill(self.pid, signal)
300            }
301        }
302    }
303}
304
305impl Drop for ChildProcess {
306    fn drop(&mut self) {
307        #[cfg(polkavm_dev_debug_zygote)]
308        let _ = self.send_signal(linux_raw::SIGINT);
309
310        #[cfg(not(polkavm_dev_debug_zygote))]
311        if self.send_signal(linux_raw::SIGKILL).is_ok() {
312            // Reap the zombie process.
313            let _ = self.check_status(false);
314        }
315    }
316}
317
318const ZYGOTE_BLOB_CONST: &[u8] = include_bytes!("./polkavm-zygote");
319static ZYGOTE_BLOB: &[u8] = ZYGOTE_BLOB_CONST;
320
321// Here we extract the necessary addresses directly from the zygote binary at compile time.
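// This is a minimal const-fn ELF reader: it walks the section headers, locates the `.address_table`
// section and reinterprets its contents as an `AddressTablePacked`.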
322const ZYGOTE_ADDRESS_TABLE: AddressTable = {
323    const fn starts_with(haystack: &[u8], needle: &[u8]) -> bool {
324        if haystack.len() < needle.len() {
325            return false;
326        }
327
328        let mut index = 0;
329        while index < needle.len() {
330            if haystack[index] != needle[index] {
331                return false;
332            }
333            index += 1;
334        }
335
336        true
337    }
338
339    const fn cast_slice<T>(slice: &[u8]) -> &T where T: Copy {
340        assert!(slice.len() >= core::mem::size_of::<T>());
341        assert!(core::mem::align_of::<T>() == 1);
342
343        // SAFETY: The size and alignment requirements of `T` were `assert`ed,
344        //         and it's `Copy` so it's guaranteed not to drop, so this is always safe.
345        unsafe {
346            &*slice.as_ptr().cast::<T>()
347        }
348    }
349
350    #[repr(C)]
351    #[derive(Copy, Clone)]
352    struct U16([u8; 2]);
353
354    impl U16 {
355        const fn get(self) -> u16 {
356            u16::from_ne_bytes(self.0)
357        }
358    }
359
360    #[repr(C)]
361    #[derive(Copy, Clone)]
362    struct U32([u8; 4]);
363
364    impl U32 {
365        const fn get(self) -> u32 {
366            u32::from_ne_bytes(self.0)
367        }
368    }
369
370    #[repr(C)]
371    #[derive(Copy, Clone)]
372    struct U64([u8; 8]);
373
374    impl U64 {
375        const fn get(self) -> u64 {
376            u64::from_ne_bytes(self.0)
377        }
378    }
379
380    #[repr(C)]
381    #[derive(Copy, Clone)]
382    struct ElfIdent {
383        magic: [u8; 4],
384        class: u8,
385        data: u8,
386        version: u8,
387        os_abi: u8,
388        abi_version: u8,
389        padding: [u8; 7],
390    }
391
392    #[repr(C)]
393    #[derive(Copy, Clone)]
394    struct ElfHeader {
395        e_ident: ElfIdent,
396        e_type: U16,
397        e_machine: U16,
398        e_version: U32,
399        e_entry: U64,
400        e_phoff: U64,
401        e_shoff: U64,
402        e_flags: U32,
403        e_ehsize: U16,
404        e_phentsize: U16,
405        e_phnum: U16,
406        e_shentsize: U16,
407        e_shnum: U16,
408        e_shstrndx: U16,
409    }
410
411    #[repr(C)]
412    #[derive(Copy, Clone)]
413    struct ElfSectionHeader {
414        sh_name: U32,
415        sh_type: U32,
416        sh_flags: U64,
417        sh_addr: U64,
418        sh_offset: U64,
419        sh_size: U64,
420        sh_link: U32,
421        sh_info: U32,
422        sh_addralign: U64,
423        sh_entsize: U64,
424    }
425
426    impl ElfHeader {
427        const fn section_header<'a>(&self, blob: &'a [u8], nth_section: u16) -> &'a ElfSectionHeader {
428            let size = self.e_shentsize.get() as usize;
429            assert!(size == core::mem::size_of::<ElfSectionHeader>());
430
431            let offset = self.e_shoff.get() as usize + nth_section as usize * size;
432            cast_slice(blob.split_at(offset).1)
433        }
434    }
435
436    impl ElfSectionHeader {
437        const fn data<'a>(&self, blob: &'a [u8]) -> &'a [u8] {
438            blob.split_at(self.sh_offset.get() as usize).1.split_at(self.sh_size.get() as usize).0
439        }
440    }
441
442    let header: &ElfHeader = cast_slice(ZYGOTE_BLOB_CONST);
443    let shstr = header.section_header(ZYGOTE_BLOB_CONST, header.e_shstrndx.get()).data(ZYGOTE_BLOB_CONST);
444
445    let mut address_table = None;
446    let mut nth_section = 0;
447    while nth_section < header.e_shnum.get() {
448        let section_header = header.section_header(ZYGOTE_BLOB_CONST, nth_section);
449        if starts_with(shstr.split_at(section_header.sh_name.get() as usize).1, b".address_table") {
450            let data = section_header.data(ZYGOTE_BLOB_CONST);
451            assert!(data.len() == core::mem::size_of::<AddressTablePacked>());
452            address_table = Some(AddressTable::from_packed(cast_slice::<AddressTablePacked>(data)));
453            break;
454        }
455        nth_section += 1;
456    }
457
458    let Some(address_table) = address_table else { panic!("broken zygote binary") };
459    address_table
460};
461
462fn create_empty_memfd(name: &core::ffi::CStr) -> Result<Fd, Error> {
463    linux_raw::sys_memfd_create(name, linux_raw::MFD_CLOEXEC | linux_raw::MFD_ALLOW_SEALING)
464}
465
466// Creating these is relatively slow, so we can keep one ready to go in memory at all times to speed up instantiation.
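// A value of -1 means "no cached memfd"; otherwise this holds a raw fd which can be claimed with a single compare-exchange.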
467static CACHED_PROGRAM_MEMFD: core::sync::atomic::AtomicI32 = core::sync::atomic::AtomicI32::new(-1);
468
469fn create_program_memfd() -> Result<Fd, Error> {
470    let memfd_raw = CACHED_PROGRAM_MEMFD.load(Ordering::Relaxed);
471    if memfd_raw != -1 && CACHED_PROGRAM_MEMFD.compare_exchange(memfd_raw, -1, Ordering::Relaxed, Ordering::Relaxed).is_ok() {
472        Ok(Fd::from_raw_unchecked(memfd_raw))
473    } else {
474        create_empty_memfd(cstr!("polkavm_program"))
475    }
476}
477
478fn cache_program_memfd_if_necessary() {
479    if CACHED_PROGRAM_MEMFD.load(Ordering::Relaxed) != -1 {
480        return;
481    }
482
483    let memfd = match create_empty_memfd(cstr!("polkavm_program")) {
484        Ok(memfd) => memfd,
485        Err(error) => {
486            // This should never happen.
487            log::warn!("Failed to create a memfd: {error}");
488            return;
489        }
490    };
491
492    if CACHED_PROGRAM_MEMFD.compare_exchange(-1, memfd.raw(), Ordering::Relaxed, Ordering::Relaxed).is_ok() {
493        memfd.leak();
494    }
495}
496
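/// Resizes `memfd` to `length` bytes, fills it with the given `data` slices using a single `writev`,
/// and then seals it so that it can no longer be resized or written to.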
497fn prepare_sealed_memfd<const N: usize>(memfd: Fd, length: usize, data: [&[u8]; N]) -> Result<Fd, Error> {
498    let native_page_size = get_native_page_size();
499    if length % native_page_size != 0 {
500        return Err(Error::from_str("memfd size doesn't end on a page boundary"));
501    }
502
503    linux_raw::sys_ftruncate(memfd.borrow(), length as linux_raw::c_ulong)?;
504
505    let expected_bytes_written = data.iter().map(|slice| slice.len()).sum::<usize>();
506    let bytes_written = linux_raw::writev(memfd.borrow(), data)?;
507    if bytes_written != expected_bytes_written {
508        return Err(Error::from_str("failed to prepare memfd: incomplete write"));
509    }
510
511    linux_raw::sys_fcntl(
512        memfd.borrow(),
513        linux_raw::F_ADD_SEALS,
514        linux_raw::F_SEAL_SEAL | linux_raw::F_SEAL_SHRINK | linux_raw::F_SEAL_GROW | linux_raw::F_SEAL_WRITE,
515    )?;
516
517    Ok(memfd)
518}
519
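/// Returns a file descriptor for the zygote executable: normally a sealed memfd containing the
/// embedded `ZYGOTE_BLOB`, or (when `polkavm_dev_debug_zygote` is enabled) an `O_PATH` handle to an on-disk build.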
520fn prepare_zygote() -> Result<Fd, Error> {
521    #[cfg(debug_assertions)]
522    if cfg!(polkavm_dev_debug_zygote) {
523        let paths = [
524            std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../polkavm-zygote/target/x86_64-unknown-linux-gnu/debug/polkavm-zygote"),
525            std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../polkavm-zygote/target/x86_64-unknown-linux-gnu/release/polkavm-zygote"),
526            std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("src/sandbox/polkavm-zygote"),
527            std::path::PathBuf::from("./polkavm-zygote"),
528        ];
529
530        let Some(path) = paths.into_iter().find(|path| {
531            path.exists() && std::fs::read(path).map(|data| data == ZYGOTE_BLOB).unwrap_or(false)
532        }) else {
533            panic!("no matching zygote binary found for debugging");
534        };
535
536        let path = std::ffi::CString::new(path.to_str().expect("invalid path to zygote")).expect("invalid path to zygote");
537        return Ok(linux_raw::sys_open(&path, linux_raw::O_CLOEXEC | linux_raw::O_PATH).unwrap());
538    }
539
540    let native_page_size = get_native_page_size();
541
542    #[allow(clippy::unwrap_used)]
543    // The size of the zygote blob is always going to be far smaller than what a usize can hold, so this never fails.
544    let length_aligned = align_to_next_page_usize(native_page_size, ZYGOTE_BLOB.len()).unwrap();
545    prepare_sealed_memfd(create_empty_memfd(cstr!("polkavm_zygote"))?, length_aligned, [ZYGOTE_BLOB])
546}
547
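/// Creates a size-sealed memfd holding a freshly initialized `VmCtx` and maps it as shared memory;
/// the memfd is later sent to the child so that both processes share the same `VmCtx`.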
548fn prepare_vmctx() -> Result<(Fd, Mmap), Error> {
549    let native_page_size = get_native_page_size();
550
551    #[allow(clippy::unwrap_used)] // The size of VmCtx is always going to be far smaller than what a usize can hold, so this never fails.
552    let length_aligned = align_to_next_page_usize(native_page_size, core::mem::size_of::<VmCtx>()).unwrap();
553
554    let memfd = create_empty_memfd(cstr!("polkavm_vmctx"))?;
555    linux_raw::sys_ftruncate(memfd.borrow(), length_aligned as linux_raw::c_ulong)?;
556    linux_raw::sys_fcntl(
557        memfd.borrow(),
558        linux_raw::F_ADD_SEALS,
559        linux_raw::F_SEAL_SEAL | linux_raw::F_SEAL_SHRINK | linux_raw::F_SEAL_GROW,
560    )?;
561
562    let vmctx = unsafe {
563        linux_raw::Mmap::map(
564            core::ptr::null_mut(),
565            length_aligned,
566            linux_raw::PROT_READ | linux_raw::PROT_WRITE,
567            linux_raw::MAP_SHARED,
568            Some(memfd.borrow()),
569            0,
570        )?
571    };
572
573    unsafe {
574        *vmctx.as_mut_ptr().cast::<VmCtx>() = VmCtx::new();
575    }
576
577    Ok((memfd, vmctx))
578}
579
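/// Runs in the freshly cloned child: sets up the sandbox (UID/GID maps, fd cleanup, `pivot_root`,
/// securebits, resource limits, dropped capabilities) and finally execs the zygote directly from its memfd.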
580unsafe fn child_main(zygote_memfd: Fd, child_socket: Fd, uid_map: &str, gid_map: &str, logging_pipe: Option<Fd>) -> Result<(), Error> {
581    // Change the name of the process.
582    linux_raw::sys_prctl_set_name(b"polkavm-sandbox\0")?;
583
584    if !cfg!(polkavm_dev_debug_zygote) {
585        // Overwrite the hostname and domainname.
586        linux_raw::sys_sethostname("localhost")?;
587        linux_raw::sys_setdomainname("localhost")?;
588
589        // Disable the 'setgroups' syscall. Probably unnecessary since we'll do it through seccomp anyway, but just in case.
590        // (See CVE-2014-8989 for more details.)
591        let proc_self = linux_raw::sys_open(cstr!("/proc/self"), linux_raw::O_CLOEXEC | linux_raw::O_PATH)?;
592        let fd = linux_raw::sys_openat(proc_self.borrow(), cstr!("setgroups"), linux_raw::O_CLOEXEC | linux_raw::O_WRONLY)?;
593        linux_raw::sys_write(fd.borrow(), b"deny")?;
594        fd.close()?;
595
596        // Set up UID and GID maps. This can only be done once, so if we do it here we'll block the possibility of doing it later.
597        let fd = linux_raw::sys_openat(proc_self.borrow(), cstr!("gid_map"), linux_raw::O_CLOEXEC | linux_raw::O_RDWR)?;
598        linux_raw::sys_write(fd.borrow(), gid_map.as_bytes())?;
599        fd.close()?;
600
601        let fd = linux_raw::sys_openat(proc_self.borrow(), cstr!("uid_map"), linux_raw::O_CLOEXEC | linux_raw::O_RDWR)?;
602        linux_raw::sys_write(fd.borrow(), uid_map.as_bytes())?;
603        fd.close()?;
604        proc_self.close()?;
605    }
606
607    let fd_limit = if logging_pipe.is_some() {
608        4
609    } else {
610        3
611    };
612
613    // This should never happen in practice, but can in theory if the user closes stdin or stderr manually.
614    // TODO: Actually support this?
615    for fd in [zygote_memfd.raw(), child_socket.raw()].into_iter().chain(logging_pipe.as_ref().map(|fd| fd.raw())) {
616        if fd == STDIN_FILENO {
617            return Err(Error::from_str("internal error: fd overlaps with stdin"));
618        }
619
620        if fd == STDERR_FILENO {
621            return Err(Error::from_str("internal error: fd overlaps with stderr"));
622        }
623    }
624
625    // Replace the stdin fd (which we don't need).
626    linux_raw::sys_dup3(child_socket.raw(), STDIN_FILENO, 0)?;
627    child_socket.close()?;
628
629    // Clean up any file descriptors which might have been opened by the host process.
630    let mut fds_to_keep = [core::ffi::c_int::MAX; 3];
631    let fds_to_keep = {
632        let mut count = 1;
633        fds_to_keep[0] = STDIN_FILENO;
634        if let Some(logging_pipe) = logging_pipe {
635            linux_raw::sys_dup3(logging_pipe.raw(), STDERR_FILENO, 0)?;
636            logging_pipe.close()?;
637            fds_to_keep[count] = STDERR_FILENO;
638            count += 1;
639        }
640
641        fds_to_keep[count] = zygote_memfd.raw();
642        count += 1;
643
644        fds_to_keep.sort_unstable(); // Should be a no-op.
645        &fds_to_keep[..count]
646    };
647    close_other_file_descriptors(fds_to_keep)?;
648
649    if !cfg!(polkavm_dev_debug_zygote) {
650        // Hide the host filesystem.
651        let mount_flags = linux_raw::MS_REC | linux_raw::MS_NODEV | linux_raw::MS_NOEXEC | linux_raw::MS_NOSUID | linux_raw::MS_RDONLY;
652        linux_raw::sys_mount(cstr!("none"), cstr!("/mnt"), cstr!("tmpfs"), mount_flags, Some(cstr!("size=0")))?;
653        linux_raw::sys_chdir(cstr!("/mnt"))?;
654        linux_raw::sys_pivot_root(cstr!("."), cstr!("."))?;
655        linux_raw::sys_umount2(cstr!("."), linux_raw::MNT_DETACH)?;
656    }
657
658    // Clear all of our ambient capabilities.
659    linux_raw::sys_prctl_cap_ambient_clear_all()?;
660
661    // Flag that this process will never want to acquire any new privileges.
662    linux_raw::sys_prctl_set_no_new_privs()?;
663
664    if !cfg!(polkavm_dev_debug_zygote) {
665        linux_raw::sys_prctl_set_securebits(
666            // Make UID == 0 have no special privileges.
667            linux_raw::SECBIT_NOROOT |
668            linux_raw::SECBIT_NOROOT_LOCKED |
669            // Calling 'setuid' from/to UID == 0 doesn't change any privileges.
670            linux_raw::SECBIT_NO_SETUID_FIXUP |
671            linux_raw::SECBIT_NO_SETUID_FIXUP_LOCKED |
672            // The process cannot add capabilities to its ambient set.
673            linux_raw::SECBIT_NO_CAP_AMBIENT_RAISE |
674            linux_raw::SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED,
675        )?;
676    }
677
678    // Set resource limits.
679    let max_memory = 8 * 1024 * 1024 * 1024;
680    linux_raw::sys_setrlimit(
681        linux_raw::RLIMIT_DATA,
682        &linux_raw::rlimit {
683            rlim_cur: max_memory,
684            rlim_max: max_memory,
685        },
686    )?;
687    linux_raw::sys_setrlimit(
688        linux_raw::RLIMIT_STACK,
689        &linux_raw::rlimit {
690            rlim_cur: 16 * 1024,
691            rlim_max: 16 * 1024,
692        },
693    )?;
694
695    linux_raw::sys_setrlimit(linux_raw::RLIMIT_NOFILE, &linux_raw::rlimit { rlim_cur: fd_limit, rlim_max: fd_limit })?;
696    linux_raw::sys_setrlimit(linux_raw::RLIMIT_NPROC, &linux_raw::rlimit { rlim_cur: 1, rlim_max: 1 })?;
697    linux_raw::sys_setrlimit(linux_raw::RLIMIT_FSIZE, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
698    linux_raw::sys_setrlimit(linux_raw::RLIMIT_LOCKS, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
699    linux_raw::sys_setrlimit(linux_raw::RLIMIT_MEMLOCK, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
700    linux_raw::sys_setrlimit(linux_raw::RLIMIT_MSGQUEUE, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
701
702    // Finally, drop all capabilities.
703    linux_raw::sys_capset_drop_all()?;
704
705    if cfg!(polkavm_dev_debug_zygote) {
706        let pid = linux_raw::sys_getpid()?;
707        linux_raw::sys_kill(pid, linux_raw::SIGSTOP)?;
708    }
709
710    let child_argv: [*const u8; 2] = [b"polkavm-zygote\0".as_ptr(), core::ptr::null()];
711    let child_envp: [*const u8; 1] = [core::ptr::null()];
712    linux_raw::sys_execveat(
713        Some(zygote_memfd.borrow()),
714        cstr!(""),
715        &child_argv,
716        &child_envp,
717        linux_raw::AT_EMPTY_PATH,
718    )?;
719
720    // This should never happen, but since the never type is still unstable let's return normally.
721    Ok(())
722}
723
724#[derive(Clone)]
725pub struct SandboxProgram(Arc<SandboxProgramInner>);
726
727struct SandboxProgramInner {
728    memfd: Fd,
729    memory_config: SandboxMemoryConfig,
730    code_range: Range<usize>,
731}
732
733impl super::SandboxProgram for SandboxProgram {
734    fn machine_code(&self) -> Cow<[u8]> {
735        // The code is kept inside the memfd and isn't readily accessible from the host,
736        // so when it's needed we read it back from the memfd.
737        let mut buffer = vec![0; self.0.code_range.len()];
738        linux_raw::sys_lseek(self.0.memfd.borrow(), self.0.code_range.start as i64, linux_raw::SEEK_SET).expect("failed to get machine code of the program: seek failed");
739
740        let mut position = 0;
741        while position < self.0.code_range.len() {
742            let count = match linux_raw::sys_read(self.0.memfd.borrow(), &mut buffer[position..]) {
743                Ok(count) => count,
744                Err(error) if error.errno() == linux_raw::EINTR => continue,
745                Err(error) => panic!("failed to get machine code of the program: read failed: {error}")
746            };
747
748            assert_ne!(count, 0);
749            position += count as usize;
750        }
751
752        Cow::Owned(buffer)
753    }
754}
755
756#[derive(Clone, PartialEq, Eq, Hash, Debug)]
757pub struct Map<'a> {
758    pub start: u64,
759    pub end: u64,
760    pub is_readable: bool,
761    pub is_writable: bool,
762    pub is_executable: bool,
763    pub is_shared: bool,
764    pub file_offset: u64,
765    pub major: u64,
766    pub minor: u64,
767    pub inode: u64,
768    pub name: &'a [u8],
769}
770
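// Minimal byte-slice scanners used to parse `/proc/<pid>/maps` lines without allocating.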
771fn parse_u64_radix(input: &[u8], radix: u32) -> Option<u64> {
772    u64::from_str_radix(core::str::from_utf8(input).ok()?, radix).ok()
773}
774
775fn get_until<'a>(p: &mut &'a [u8], delimiter: u8) -> &'a [u8] {
776    let mut found = None;
777    for (index, ch) in p.iter().enumerate() {
778        if *ch == delimiter {
779            found = Some(index);
780            break;
781        }
782    }
783
784    if let Some(index) = found {
785        let (before, after) = p.split_at(index);
786        *p = &after[1..];
787        before
788    } else {
789        let before = *p;
790        *p = b"";
791        before
792    }
793}
794
795fn get_char(p: &mut &[u8]) -> Option<u8> {
796    let ch = p.first()?;
797    *p = &p[1..];
798    Some(*ch)
799}
800
801fn skip_whitespace(p: &mut &[u8]) {
802    while let Some(ch) = p.first() {
803        if *ch == b' ' {
804            *p = &p[1..];
805        } else {
806            break;
807        }
808    }
809}
810
811impl<'a> Map<'a> {
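    /// Parses a single line of `/proc/<pid>/maps`.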
812    fn parse(mut line: &'a [u8]) -> Option<Self> {
813        let start = parse_u64_radix(get_until(&mut line, b'-'), 16)?;
814        let end = parse_u64_radix(get_until(&mut line, b' '), 16)?;
815        let is_readable = get_char(&mut line)? == b'r';
816        let is_writable = get_char(&mut line)? == b'w';
817        let is_executable = get_char(&mut line)? == b'x';
818        let is_shared = get_char(&mut line)? == b's';
819        get_char(&mut line);
820
821        let file_offset = parse_u64_radix(get_until(&mut line, b' '), 16)?;
822        let major = parse_u64_radix(get_until(&mut line, b':'), 16)?;
823        let minor = parse_u64_radix(get_until(&mut line, b' '), 16)?;
824        let inode = parse_u64_radix(get_until(&mut line, b' '), 10)?;
825        skip_whitespace(&mut line);
826        let name = line;
827
828        Some(Map {
829            start,
830            end,
831            is_readable,
832            is_writable,
833            is_executable,
834            is_shared,
835            file_offset,
836            major,
837            minor,
838            inode,
839            name,
840        })
841    }
842}
843
844fn get_message(vmctx: &VmCtx) -> Option<String> {
845    let message = unsafe {
846        let message_length = *vmctx.message_length.get() as usize;
847        let message = &*vmctx.message_buffer.get();
848        &message[..core::cmp::min(message_length, message.len())]
849    };
850
851    if message.is_empty() {
852        return None;
853    }
854
855    // The message lives in shared memory, so copy it first to make sure
856    // it doesn't change under us and violate `String`'s invariants.
857    let message = message.to_vec();
858    match String::from_utf8(message) {
859        Ok(message) => Some(message),
860        Err(error) => {
861            let message = error.into_bytes();
862            Some(String::from_utf8_lossy(&message).into_owned())
863        }
864    }
865}
866
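/// Formats `message` into the shared message buffer so that the host can later retrieve it with
/// `get_message`; the child uses this to report fatal errors before aborting.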
867unsafe fn set_message(vmctx: &VmCtx, message: core::fmt::Arguments) {
868    struct Adapter<'a>(std::io::Cursor<&'a mut [u8]>);
869    impl<'a> core::fmt::Write for Adapter<'a> {
870        fn write_str(&mut self, string: &str) -> Result<(), core::fmt::Error> {
871            use std::io::Write;
872            self.0.write_all(string.as_bytes()).map_err(|_| core::fmt::Error)
873        }
874    }
875
876    let buffer: &mut [u8] = &mut *vmctx.message_buffer.get();
877    let mut cursor = Adapter(std::io::Cursor::new(buffer));
878    let _ = core::fmt::write(&mut cursor, message);
879    let length = cursor.0.position() as usize;
880
881    *vmctx.message_length.get() = length as u32;
882}
883
884pub struct Sandbox {
885    _lifetime_pipe: Fd,
886    vmctx_mmap: Mmap,
887    child: ChildProcess,
888    socket: Fd,
889
890    count_wait_loop_start: u64,
891    count_futex_wait: u64,
892
893    module: Option<Module>,
894    gas_metering: Option<GasMeteringKind>,
895}
896
897impl Drop for Sandbox {
898    fn drop(&mut self) {
899        let vmctx = self.vmctx();
900        let child_futex_wait = unsafe { *vmctx.counters.syscall_futex_wait.get() };
901        let child_loop_start = unsafe { *vmctx.counters.syscall_wait_loop_start.get() };
902        log::debug!(
903            "Host futex wait count: {}/{} ({:.02}%)",
904            self.count_futex_wait,
905            self.count_wait_loop_start,
906            self.count_futex_wait as f64 / self.count_wait_loop_start as f64 * 100.0
907        );
908        log::debug!(
909            "Child futex wait count: {}/{} ({:.02}%)",
910            child_futex_wait,
911            child_loop_start,
912            child_futex_wait as f64 / child_loop_start as f64 * 100.0
913        );
914    }
915}
916
917impl super::SandboxAddressSpace for () {
918    fn native_code_address(&self) -> u64 {
919        VM_ADDR_NATIVE_CODE
920    }
921}
922
923impl super::Sandbox for Sandbox {
924    const KIND: SandboxKind = SandboxKind::Linux;
925
926    type Access<'r> = SandboxAccess<'r>;
927    type Config = SandboxConfig;
928    type Error = Error;
929    type Program = SandboxProgram;
930    type AddressSpace = ();
931
932    fn as_sandbox_vec(vec: &SandboxVec) -> &Mutex<Vec<Self>> {
933        #[allow(clippy::match_wildcard_for_single_variants)]
934        match vec {
935            SandboxVec::Linux(ref vec) => vec,
936            _ => unreachable!(),
937        }
938    }
939
940    fn as_compiled_module(module: &Module) -> &CompiledModule<Self> {
941        match module.compiled_module() {
942            CompiledModuleKind::Linux(ref module) => module,
943            _ => unreachable!(),
944        }
945    }
946
947    fn reserve_address_space() -> Result<Self::AddressSpace, Self::Error> {
948        Ok(())
949    }
950
951    fn prepare_program(init: SandboxInit, (): Self::AddressSpace) -> Result<Self::Program, Self::Error> {
952        static PADDING: [u8; VM_MAX_PAGE_SIZE as usize] = [0; VM_MAX_PAGE_SIZE as usize];
953
954        let native_page_size = get_native_page_size();
955        let cfg = init.memory_config(native_page_size)?;
956        let ro_data_padding = &PADDING[..cfg.ro_data_fd_size as usize - init.guest_init.ro_data.len()];
957        let rw_data_padding = &PADDING[..cfg.rw_data_fd_size as usize - init.guest_init.rw_data.len()];
958        let code_padding = &PADDING[..cfg.code_size as usize - init.code.len()];
959
960        let memfd = prepare_sealed_memfd(
961            create_program_memfd()?,
962            cfg.ro_data_fd_size as usize + cfg.rw_data_fd_size as usize + cfg.code_size as usize + cfg.jump_table_size as usize,
963            [
964                init.guest_init.ro_data,
965                ro_data_padding,
966                init.guest_init.rw_data,
967                rw_data_padding,
968                init.code,
969                code_padding,
970                init.jump_table
971            ]
972        )?;
973
974        let code_offset = cfg.ro_data_fd_size as usize + cfg.rw_data_fd_size as usize;
975        let code_range = code_offset..code_offset + init.code.len();
976
977        Ok(SandboxProgram(Arc::new(SandboxProgramInner {
978            memfd,
979            memory_config: cfg,
980            code_range,
981        })))
982    }
983
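    // Spawns a new worker process: clones a heavily namespaced child (falling back to plain `clone`
    // on older kernels), execs the zygote from a sealed memfd, and then completes the futex-based
    // initialization handshake over the shared `VmCtx`.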
984    fn spawn(config: &SandboxConfig) -> Result<Self, Error> {
985        let sigset = Sigmask::block_all_signals()?;
986        let zygote_memfd = prepare_zygote()?;
987        let (vmctx_memfd, vmctx_mmap) = prepare_vmctx()?;
988        let (socket, child_socket) = linux_raw::sys_socketpair(linux_raw::AF_UNIX, linux_raw::SOCK_SEQPACKET | linux_raw::SOCK_CLOEXEC, 0)?;
989        let (lifetime_pipe_host, lifetime_pipe_child) = linux_raw::sys_pipe2(linux_raw::O_CLOEXEC)?;
990
991        let sandbox_flags =
992            if !cfg!(polkavm_dev_debug_zygote) {
993                u64::from(linux_raw::CLONE_NEWCGROUP
994                    | linux_raw::CLONE_NEWIPC
995                    | linux_raw::CLONE_NEWNET
996                    | linux_raw::CLONE_NEWNS
997                    | linux_raw::CLONE_NEWPID
998                    | linux_raw::CLONE_NEWUSER
999                    | linux_raw::CLONE_NEWUTS)
1000            } else {
1001                0
1002            };
1003
1004        let mut pidfd: c_int = -1;
1005        let args = CloneArgs {
1006            flags: linux_raw::CLONE_CLEAR_SIGHAND | u64::from(linux_raw::CLONE_PIDFD) | sandbox_flags,
1007            pidfd: &mut pidfd,
1008            child_tid: 0,
1009            parent_tid: 0,
1010            exit_signal: 0,
1011            stack: 0,
1012            stack_size: 0,
1013            tls: 0,
1014        };
1015
1016        let uid = linux_raw::sys_getuid()?;
1017        let gid = linux_raw::sys_getgid()?;
1018
1019        let uid_map = format!("0 {} 1\n", uid);
1020        let gid_map = format!("0 {} 1\n", gid);
1021
1022        let (logger_rx, logger_tx) = if config.enable_logger {
1023            let (rx, tx) = linux_raw::sys_pipe2(linux_raw::O_CLOEXEC)?;
1024            (Some(rx), Some(tx))
1025        } else {
1026            (None, None)
1027        };
1028
1029        // Fork a new process.
1030        let mut child_pid =
1031            unsafe { linux_raw::syscall!(linux_raw::SYS_clone3, core::ptr::addr_of!(args), core::mem::size_of::<CloneArgs>()) };
1032
1033        if child_pid < 0 {
1034            // Fallback for Linux versions older than 5.5.
1035            let error = Error::from_last_os_error("clone");
1036            child_pid = unsafe { linux_raw::syscall!(linux_raw::SYS_clone, sandbox_flags, 0, 0, 0, 0) };
1037
1038            if child_pid < 0 {
1039                return Err(error);
1040            }
1041        }
1042
1043        if child_pid == 0 {
1044            // We're in the child.
1045            //
1046            // Calling into libc from here risks a deadlock as other threads might have
1047            // been holding onto internal libc locks while we were cloning ourselves,
1048            // so from now on we can't use anything from libc anymore.
1049            core::mem::forget(sigset);
1050
1051            unsafe {
1052                match child_main(zygote_memfd, child_socket, &uid_map, &gid_map, logger_tx) {
1053                    Ok(()) => {
1054                        // This is unreachable: a successful execveat never returns, so child_main can only return with an error.
1055                        abort();
1056                    }
1057                    Err(error) => {
1058                        let vmctx = &*vmctx_mmap.as_ptr().cast::<VmCtx>();
1059                        set_message(vmctx, format_args!("fatal error while spawning child: {error}"));
1060
1061                        abort();
1062                    }
1063                }
1064            }
1065        }
1066
1067        if let Some(logger_rx) = logger_rx {
1068            // Hook up the child process' STDERR to our logger.
1069            std::thread::Builder::new()
1070                .name("polkavm-logger".into())
1071                .spawn(move || {
1072                    let mut tmp = [0; 4096];
1073                    let mut buffer = Vec::new();
1074                    loop {
1075                        if buffer.len() > 8192 {
1076                            // Make sure the child can't exhaust our memory by spamming logs.
1077                            buffer.clear();
1078                        }
1079
1080                        match linux_raw::sys_read(logger_rx.borrow(), &mut tmp) {
1081                            Err(error) if error.errno() == linux_raw::EINTR => continue,
1082                            Err(error) => {
1083                                log::warn!("Failed to read from logger: {}", error);
1084                                break;
1085                            }
1086                            Ok(0) => break,
1087                            Ok(count) => {
1088                                let mut tmp = &tmp[..count];
1089                                while !tmp.is_empty() {
1090                                    if let Some(index) = tmp.iter().position(|&byte| byte == b'\n') {
1091                                        buffer.extend_from_slice(&tmp[..index]);
1092                                        tmp = &tmp[index + 1..];
1093
1094                                        log::trace!(target: "polkavm::zygote", "Child #{}: {}", child_pid, String::from_utf8_lossy(&buffer));
1095                                        buffer.clear();
1096                                    } else {
1097                                        buffer.extend_from_slice(tmp);
1098                                        break;
1099                                    }
1100                                }
1101                            }
1102                        }
1103                    }
1104                })
1105                .map_err(|error| Error::from_os_error("failed to spawn logger thread", error))?;
1106        }
1107
1108        let mut child = ChildProcess {
1109            pid: child_pid as c_int,
1110            pidfd: if pidfd < 0 { None } else { Some(Fd::from_raw_unchecked(pidfd)) },
1111        };
1112
1113        // We're in the parent. Close the child's end of the socket and restore the signal mask.
1114        child_socket.close()?;
1115        sigset.unblock()?;
1116
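        // Waits until the child advances the shared futex from `current_state` to `target_state`,
        // bailing out early if the child dies or initialization takes too long.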
1117        fn wait_for_futex(vmctx: &VmCtx, child: &mut ChildProcess, current_state: u32, target_state: u32) -> Result<(), Error> {
1118            let instant = Instant::now();
1119            loop {
1120                let state = vmctx.futex.load(Ordering::Relaxed);
1121                if state == target_state {
1122                    return Ok(());
1123                }
1124
1125                if state != current_state {
1126                    return Err(Error::from_str("failed to initialize sandbox process: unexpected futex state"));
1127                }
1128
1129                let status = child.check_status(true)?;
1130                if !status.is_running() {
1131                    let message = get_message(vmctx);
1132                    if let Some(message) = message {
1133                        let error = Error::from(format!("failed to initialize sandbox process: {status}: {message}"));
1134                        return Err(error);
1135                    } else {
1136                        return Err(Error::from(format!(
1137                            "failed to initialize sandbox process: child process unexpectedly quit: {status}",
1138                        )));
1139                    }
1140                }
1141
1142                if !cfg!(polkavm_dev_debug_zygote) && instant.elapsed() > core::time::Duration::from_secs(10) {
1143                    // This should never happen, but just in case.
1144                    return Err(Error::from_str("failed to initialize sandbox process: initialization timeout"));
1145                }
1146
1147                match linux_raw::sys_futex_wait(&vmctx.futex, state, Some(core::time::Duration::from_millis(100))) {
1148                    Ok(()) => continue,
1149                    Err(error)
1150                        if error.errno() == linux_raw::EAGAIN
1151                            || error.errno() == linux_raw::EINTR
1152                            || error.errno() == linux_raw::ETIMEDOUT =>
1153                    {
1154                        continue
1155                    }
1156                    Err(error) => return Err(error),
1157                }
1158            }
1159        }
1160
1161        #[cfg(debug_assertions)]
1162        if cfg!(polkavm_dev_debug_zygote) {
1163            use core::fmt::Write;
1164            std::thread::sleep(core::time::Duration::from_millis(200));
1165
1166            let mut command = String::new();
1167            // Make sure gdb can actually attach to the worker process.
1168            if std::fs::read_to_string("/proc/sys/kernel/yama/ptrace_scope").map(|value| value.trim() == "1").unwrap_or(false) {
1169                command.push_str("echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope ;");
1170            }
1171
1172            command.push_str(concat!(
1173                "gdb",
1174                " -ex 'set pagination off'",
1175                " -ex 'layout split'",
1176                " -ex 'set print asm-demangle on'",
1177                " -ex 'set debuginfod enabled off'",
1178                " -ex 'tcatch exec'",
1179                " -ex 'handle SIGSTOP nostop'",
1180            ));
1181
1182            let _ = write!(&mut command, " -ex 'attach {}' -ex 'continue'", child.pid);
1183
1184            let mut cmd =
1185                if std::env::var_os("DISPLAY").is_some() {
1186                    // Running X11; open gdb in a terminal.
1187                    let mut cmd = std::process::Command::new("urxvt");
1188                    cmd
1189                        .args(["-fg", "rgb:ffff/ffff/ffff"])
1190                        .args(["-bg", "rgba:0000/0000/0000/7777"])
1191                        .arg("-e")
1192                        .arg("sh")
1193                        .arg("-c")
1194                        .arg(&command);
1195                    cmd
1196                } else {
1197                    // Not running under X11; just run it as-is.
1198                    let mut cmd = std::process::Command::new("sh");
1199                    cmd
1200                        .arg("-c")
1201                        .arg(&command);
1202                    cmd
1203                };
1204
1205            let mut gdb = match cmd.spawn() {
1206                Ok(child) => child,
1207                Err(error) => {
1208                    panic!("failed to launch: '{cmd:?}': {error}");
1209                }
1210            };
1211
1212            let pid = child.pid;
1213            std::thread::spawn(move || {
1214                let _ = gdb.wait();
1215                let _ = linux_raw::sys_kill(pid, linux_raw::SIGKILL);
1216            });
1217        }
1218
1219        let vmctx = unsafe { &*vmctx_mmap.as_ptr().cast::<VmCtx>() };
1220
1221        // Send the vmctx memfd to the child process.
1222        if let Err(error) = linux_raw::sendfd(socket.borrow(), vmctx_memfd.borrow()) {
1223            let message = get_message(vmctx);
1224            if let Some(message) = message {
1225                let error = Error::from(format!("failed to initialize sandbox process: {error} (root cause: {message})"));
1226                return Err(error);
1227            }
1228
1229            return Err(error);
1230        }
1231
1232        linux_raw::sendfd(socket.borrow(), lifetime_pipe_child.borrow())?;
1233        lifetime_pipe_child.close()?;
1234
1235        // Wait until the child process receives the vmctx memfd.
1236        wait_for_futex(vmctx, &mut child, VMCTX_FUTEX_BUSY, VMCTX_FUTEX_INIT)?;
1237
1238        // Grab the child process' maps and see what we can unmap.
1239        //
1240        // The child process can't do it itself as it's too sandboxed.
1241        let maps = std::fs::read(format!("/proc/{}/maps", child_pid))
1242            .map_err(|error| Error::from_errno("failed to read child's maps", error.raw_os_error().unwrap_or(0)))?;
1243
1244        for line in maps.split(|&byte| byte == b'\n') {
1245            if line.is_empty() {
1246                continue;
1247            }
1248
1249            let map = Map::parse(line).ok_or_else(|| Error::from_str("failed to parse the maps of the child process"))?;
1250            match map.name {
1251                b"[stack]" => {
1252                    vmctx.init.stack_address.store(map.start, Ordering::Relaxed);
1253                    vmctx.init.stack_length.store(map.end - map.start, Ordering::Relaxed);
1254                }
1255                b"[vdso]" => {
1256                    vmctx.init.vdso_address.store(map.start, Ordering::Relaxed);
1257                    vmctx.init.vdso_length.store(map.end - map.start, Ordering::Relaxed);
1258                }
1259                b"[vvar]" => {
1260                    vmctx.init.vvar_address.store(map.start, Ordering::Relaxed);
1261                    vmctx.init.vvar_length.store(map.end - map.start, Ordering::Relaxed);
1262                }
1263                b"[vsyscall]" => {
1264                    if map.is_readable {
1265                        return Err(Error::from_str("failed to initialize sandbox process: vsyscall region is readable"));
1266                    }
1267                }
1268                _ => {}
1269            }
1270        }
1271
1272        // Wake the child so that it finishes initialization.
1273        vmctx.futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1274        linux_raw::sys_futex_wake_one(&vmctx.futex)?;
1275
1276        // Wait for the child to finish initialization.
1277        wait_for_futex(vmctx, &mut child, VMCTX_FUTEX_BUSY, VMCTX_FUTEX_IDLE)?;
1278
1279        Ok(Sandbox {
1280            _lifetime_pipe: lifetime_pipe_host,
1281            vmctx_mmap,
1282            child,
1283            socket,
1284
1285            count_wait_loop_start: 0,
1286            count_futex_wait: 0,
1287
1288            module: None,
1289            gas_metering: None,
1290        })
1291    }
1292
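    // Publishes the call parameters (module reconfiguration, gas, entry point, registers) into the
    // shared `VmCtx`, then flips the futex to BUSY and wakes the worker; unless the call is
    // asynchronous, this also waits for the worker to finish.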
1293    fn execute(&mut self, mut args: ExecuteArgs) -> Result<(), ExecutionError<Self::Error>> {
1294        self.wait_if_necessary(match args.hostcall_handler {
1295            Some(ref mut hostcall_handler) => Some(&mut *hostcall_handler),
1296            None => None,
1297        }, true)?;
1298
1299        if args.is_async && args.hostcall_handler.is_some() {
1300            return Err(Error::from_str("requested asynchronous execution with a borrowed hostcall handler").into());
1301        }
1302
1303        unsafe {
1304            if let Some(module) = args.module {
1305                args.flags |= polkavm_common::zygote::VM_RPC_FLAG_RECONFIGURE;
1306
1307                let compiled_module = Self::as_compiled_module(module);
1308                let program = &compiled_module.sandbox_program;
1309                *self.vmctx().memory_config.get() = program.0.memory_config.clone();
1310                *self.vmctx().heap_info.heap_top.get() = u64::from(module.memory_map().heap_base());
1311                *self.vmctx().heap_info.heap_threshold.get() = u64::from(module.memory_map().rw_data_range().end);
1312                self.gas_metering = module.gas_metering();
1313                self.module = Some(module.clone());
1314            }
1315
1316            if let Some(gas) = crate::sandbox::get_gas(&args, self.gas_metering) {
1317                *self.vmctx().gas().get() = gas;
1318            }
1319
1320            *self.vmctx().rpc_address.get() = args.entry_point.map_or(0, |entry_point|
1321                Self::as_compiled_module(self.module.as_ref().unwrap()).export_trampolines[entry_point] as usize
1322            ) as u64;
1323
1324            *self.vmctx().rpc_flags.get() = args.flags;
1325            *self.vmctx().rpc_sbrk.get() = args.sbrk;
1326
1327            if let Some(regs) = args.regs {
1328                (*self.vmctx().regs().get()).copy_from_slice(regs);
1329            }
1330
1331            self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1332            linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1333
1334            if let Some(module) = args.module {
1335                let compiled_module = Self::as_compiled_module(module);
1336                // TODO: This can block forever.
1337                linux_raw::sendfd(self.socket.borrow(), compiled_module.sandbox_program.0.memfd.borrow())?;
1338            }
1339        }
1340
1341        if !args.is_async {
1342            self.wait_if_necessary(match args.hostcall_handler {
1343                Some(ref mut hostcall_handler) => Some(&mut *hostcall_handler),
1344                None => None,
1345            }, args.entry_point.is_none())?;
1346        }
1347
1348        Ok(())
1349    }
1350
1351    #[inline]
1352    fn access(&mut self) -> SandboxAccess {
1353        SandboxAccess { sandbox: self }
1354    }
1355
1356    fn pid(&self) -> Option<u32> {
1357        Some(self.child.pid as u32)
1358    }
1359
1360    fn address_table() -> AddressTable {
1361        ZYGOTE_ADDRESS_TABLE
1362    }
1363
1364    fn vmctx_regs_offset() -> usize {
1365        get_field_offset!(VmCtx::new(), |base| base.regs().get())
1366    }
1367
1368    fn vmctx_gas_offset() -> usize {
1369        get_field_offset!(VmCtx::new(), |base| base.gas().get())
1370    }
1371
1372    fn vmctx_heap_info_offset() -> usize {
1373        get_field_offset!(VmCtx::new(), |base| base.heap_info())
1374    }
1375
1376    fn gas_remaining_impl(&self) -> Result<Option<Gas>, super::OutOfGas> {
1377        if self.gas_metering.is_none() { return Ok(None) };
1378        let raw_gas = unsafe { *self.vmctx().gas().get() };
1379        Gas::from_i64(raw_gas).ok_or(super::OutOfGas).map(Some)
1380    }
1381
1382    fn sync(&mut self) -> Result<(), Self::Error> {
1383        self.wait_if_necessary(None, true).map_err(|error| {
1384            match error {
1385                ExecutionError::Trap(..) => Error::from_str("unexpected trap"),
1386                ExecutionError::OutOfGas => Error::from_str("unexpected out of gas"),
1387                ExecutionError::Error(error) => error,
1388            }
1389        })
1390    }
1391}
1392
1393impl Sandbox {
1394    #[inline]
1395    fn vmctx(&self) -> &VmCtx {
1396        unsafe { &*self.vmctx_mmap.as_ptr().cast::<VmCtx>() }
1397    }
1398
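    // Waits for the worker to leave the `VMCTX_FUTEX_BUSY` state: first by yielding/spinning (in
    // low-latency mode), then by sleeping on the futex with a timeout, servicing hostcalls and
    // traps reported by the worker along the way.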
1399    #[inline(never)]
1400    #[cold]
1401    fn wait(&mut self, mut hostcall_handler: Option<HostcallHandler>, low_latency: bool) -> Result<(), ExecutionError<Error>> {
1402        let mut spin_target = 0;
1403        let mut yield_target = 0;
1404        if low_latency {
1405            yield_target = 20;
1406        }
1407
1408        'outer: loop {
1409            self.count_wait_loop_start += 1;
1410
1411            let state = self.vmctx().futex.load(Ordering::Relaxed);
1412            if state == VMCTX_FUTEX_IDLE {
1413                core::sync::atomic::fence(Ordering::Acquire);
1414                return Ok(());
1415            }
1416
1417            if state == VMCTX_FUTEX_TRAP {
1418                core::sync::atomic::fence(Ordering::Acquire);
1419
1420                self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1421                linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1422
1423                return Err(ExecutionError::Trap(Trap::default()));
1424            }
1425
1426            if state == VMCTX_FUTEX_HOSTCALL {
1427                core::sync::atomic::fence(Ordering::Acquire);
1428
1429                let hostcall_handler = match hostcall_handler {
1430                    Some(ref mut hostcall_handler) => &mut *hostcall_handler,
1431                    None => {
1432                        unsafe {
1433                            *self.vmctx().hostcall().get() = polkavm_common::zygote::HOSTCALL_ABORT_EXECUTION;
1434                        }
1435                        self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1436                        linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1437
1438                        return Err(Error::from_str("hostcall called without any hostcall handler set").into());
1439                    }
1440                };
1441
1442                let hostcall = unsafe { *self.vmctx().hostcall().get() };
1443                if hostcall == polkavm_common::HOSTCALL_TRACE {
1444                    // When tracing, spin aggressively to avoid having to call into the kernel.
                    spin_target = 512;
                }

                match hostcall_handler(hostcall, super::Sandbox::access(self).into()) {
                    Ok(()) => {
                        self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
                        linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
                        continue;
                    }
                    Err(trap) => {
                        unsafe {
                            *self.vmctx().hostcall().get() = polkavm_common::zygote::HOSTCALL_ABORT_EXECUTION;
                        }
                        self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
                        linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;

                        return Err(ExecutionError::Trap(trap));
                    }
                }
            }

            if state != VMCTX_FUTEX_BUSY {
                return Err(Error::from_str("internal error: unexpected worker process state").into());
            }

            // We're going to be waiting anyway, so do some useful work if we can.
            cache_program_memfd_if_necessary();

            for _ in 0..yield_target {
                let _ = linux_raw::sys_sched_yield();
                if self.vmctx().futex.load(Ordering::Relaxed) != VMCTX_FUTEX_BUSY {
                    continue 'outer;
                }
            }

            for _ in 0..spin_target {
                core::hint::spin_loop();
                if self.vmctx().futex.load(Ordering::Relaxed) != VMCTX_FUTEX_BUSY {
                    continue 'outer;
                }
            }

            self.count_futex_wait += 1;
            match linux_raw::sys_futex_wait(&self.vmctx().futex, VMCTX_FUTEX_BUSY, Some(core::time::Duration::from_millis(100))) {
                Ok(()) => continue,
                Err(error) if error.errno() == linux_raw::EAGAIN || error.errno() == linux_raw::EINTR => continue,
                Err(error) if error.errno() == linux_raw::ETIMEDOUT => {
                    log::trace!("Timeout expired while waiting for child #{}...", self.child.pid);
                    self.check_child_status()?;
                }
                Err(error) => return Err(error.into()),
            }
        }
    }
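    // The loop above is the host side of a small futex-based handshake with the
    // worker: the host waits while `vmctx().futex` reads VMCTX_FUTEX_BUSY, the worker
    // flips the word to IDLE (finished), TRAP (aborted) or HOSTCALL (needs the host)
    // and wakes us, and the host answers TRAP/HOSTCALL by storing BUSY again and
    // waking the worker.
    //
    // An illustrative sketch (not code used by this module) of the kind of closure
    // that ends up here as `hostcall_handler`, assuming `HostcallHandler` has the
    // `FnMut(hostcall, BackendAccess) -> Result<(), Trap>` shape it is called with
    // above; the hostcall number `42`, the register choice and the exact integer
    // type of `hostcall` are made up:
    //
    //     let mut handler = |hostcall, mut access: BackendAccess| {
    //         if hostcall == 42 {
    //             let value = access.get_reg(Reg::A0);
    //             access.set_reg(Reg::A0, value.wrapping_add(1));
    //             Ok(())
    //         } else {
    //             Err(Trap::default())
    //         }
    //     };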

    fn check_child_status(&mut self) -> Result<(), Error> {
        let status = self.child.check_status(true)?;
        if status.is_running() {
            return Ok(());
        }

        log::trace!("Child #{} is not running anymore: {status}", self.child.pid);
        let message = get_message(self.vmctx());
        if let Some(message) = message {
            Err(Error::from(format!("{status}: {message}")))
        } else {
            Err(Error::from(format!("worker process unexpectedly quit: {status}")))
        }
    }

    #[inline]
    fn wait_if_necessary(&mut self, hostcall_handler: Option<HostcallHandler>, low_latency: bool) -> Result<(), ExecutionError<Error>> {
        if self.vmctx().futex.load(Ordering::Relaxed) != VMCTX_FUTEX_IDLE {
            self.wait(hostcall_handler, low_latency)?;
        }

        Ok(())
    }
}

pub struct SandboxAccess<'a> {
    sandbox: &'a mut Sandbox,
}

impl<'a> From<SandboxAccess<'a>> for BackendAccess<'a> {
    fn from(access: SandboxAccess<'a>) -> Self {
        BackendAccess::CompiledLinux(access)
    }
}

impl<'a> Access<'a> for SandboxAccess<'a> {
    type Error = MemoryAccessError<linux_raw::Error>;

    fn get_reg(&self, reg: Reg) -> u32 {
        let regs = unsafe { &*self.sandbox.vmctx().regs().get() };
        regs[reg as usize]
    }

    fn set_reg(&mut self, reg: Reg, value: u32) {
        unsafe {
            (*self.sandbox.vmctx().regs().get())[reg as usize] = value;
        }
    }
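    // Register reads and writes don't require signalling the worker: they operate on
    // the register array in the shared `VmCtx` mapping directly, which is only
    // meaningful while the worker is parked on the futex (e.g. while a hostcall is
    // being serviced). Illustrative use from inside a hostcall handler (the register
    // choice and constant are made up):
    //
    //     let a0 = access.get_reg(Reg::A0);
    //     access.set_reg(Reg::A1, a0 ^ 0xdead_beef);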

    fn read_memory_into_slice<'slice, T>(&self, address: u32, buffer: &'slice mut T) -> Result<&'slice mut [u8], Self::Error>
    where
        T: ?Sized + AsUninitSliceMut,
    {
        let slice = buffer.as_uninit_slice_mut();
        log::trace!(
            "Reading memory: 0x{:x}-0x{:x} ({} bytes)",
            address,
            address as usize + slice.len(),
            slice.len()
        );

        if address as usize + slice.len() > 0xffffffff {
            return Err(MemoryAccessError {
                address,
                length: slice.len() as u64,
                error: Error::from_str("out of range read"),
            });
        }

        let length = slice.len();
        // Pass a reborrow of `slice` into the call so `slice` itself stays usable in the match arms below.
        match linux_raw::vm_read_memory(self.sandbox.child.pid, [&mut *slice], [(address as usize, length)]) {
            Ok(actual_length) if actual_length == length => {
                unsafe { Ok(slice_assume_init_mut(slice)) }
            },
            Ok(_) => {
                Err(MemoryAccessError {
                    address,
                    length: slice.len() as u64,
                    error: Error::from_str("incomplete read"),
                })
            },
            Err(error) => {
                Err(MemoryAccessError {
                    address,
                    length: slice.len() as u64,
                    error,
                })
            }
        }
    }
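    // Reads go through a single `vm_read_memory` call, which copies straight out of
    // the worker's address space given its pid, so no handshake with the worker is
    // needed. A short usage sketch (the guest address is made up):
    //
    //     let mut buffer = [0u8; 16];
    //     let bytes = access.read_memory_into_slice(0x10000, &mut buffer[..])?;
    //     assert_eq!(bytes.len(), 16);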

    fn write_memory(&mut self, address: u32, data: &[u8]) -> Result<(), Self::Error> {
        log::trace!(
            "Writing memory: 0x{:x}-0x{:x} ({} bytes)",
            address,
            address as usize + data.len(),
            data.len()
        );

        if address as usize + data.len() > 0xffffffff {
            return Err(MemoryAccessError {
                address,
                length: data.len() as u64,
                error: Error::from_str("out of range write"),
            });
        }

        self.sandbox.vmctx().is_memory_dirty.store(true, Ordering::Relaxed);

        let length = data.len();
        match linux_raw::vm_write_memory(self.sandbox.child.pid, [data], [(address as usize, length)]) {
            Ok(actual_length) if actual_length == length => {
                Ok(())
            },
            Ok(_) => {
                Err(MemoryAccessError {
                    address,
                    length: data.len() as u64,
                    error: Error::from_str("incomplete write"),
                })
            },
            Err(error) => {
                Err(MemoryAccessError {
                    address,
                    length: data.len() as u64,
                    error,
                })
            }
        }
    }
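    // Writes mirror the read path via `vm_write_memory`, with the extra step of
    // flagging the guest memory as dirty so it can be restored before the sandbox is
    // reused. Usage sketch (the guest address is made up):
    //
    //     access.write_memory(0x10000, b"hello")?;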

    fn sbrk(&mut self, size: u32) -> Option<u32> {
        if size == 0 {
            return Some(unsafe { *self.sandbox.vmctx().heap_info().heap_top.get() as u32 });
        }

        debug_assert_eq!(self.sandbox.vmctx().futex.load(Ordering::Relaxed), VMCTX_FUTEX_HOSTCALL);

        unsafe {
            *self.sandbox.vmctx().rpc_sbrk.get() = size;
            *self.sandbox.vmctx().hostcall().get() = polkavm_common::zygote::HOSTCALL_SBRK;
        }

        self.sandbox.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
        if let Err(error) = linux_raw::sys_futex_wake_one(&self.sandbox.vmctx().futex) {
            panic!("sbrk failed: {error}");
        }

        let mut timestamp = Instant::now();
        loop {
            let _ = linux_raw::sys_sched_yield();
            if self.sandbox.vmctx().futex.load(Ordering::Relaxed) == VMCTX_FUTEX_BUSY {
                let new_timestamp = Instant::now();
                let elapsed = new_timestamp - timestamp;
                if elapsed >= Duration::from_millis(100) {
                    timestamp = new_timestamp;
                    if let Err(error) = self.sandbox.check_child_status() {
                        panic!("sbrk failed: {error}");
                    }
                }
                continue;
            }

            core::sync::atomic::fence(Ordering::Acquire);
            break;
        }

        debug_assert_eq!(self.sandbox.vmctx().futex.load(Ordering::Relaxed), VMCTX_FUTEX_HOSTCALL);

        let result = unsafe { *self.sandbox.vmctx().rpc_sbrk.get() };
        if result == 0 {
            None
        } else {
            Some(result)
        }
    }
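    // Growing the heap is a small RPC to the worker: the requested size goes into
    // `rpc_sbrk`, the hostcall slot is set to HOSTCALL_SBRK, and the futex is flipped
    // back to BUSY so the worker performs the actual allocation on its side; once the
    // worker parks in VMCTX_FUTEX_HOSTCALL again, the result is read back from
    // `rpc_sbrk`, with 0 meaning the request was refused. Usage sketch from inside a
    // hostcall handler (the size is made up; the nonzero result matches what the
    // `size == 0` fast path above returns, i.e. the heap top):
    //
    //     match access.sbrk(4096) {
    //         Some(_new_heap_top) => { /* the heap was grown */ }
    //         None => { /* the worker could not grow the heap */ }
    //     }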

    fn heap_size(&self) -> u32 {
        let heap_base = unsafe { (*self.sandbox.vmctx().memory_config.get()).memory_map.heap_base() };
        let heap_top = unsafe { *self.sandbox.vmctx().heap_info().heap_top.get() };
        (heap_top - u64::from(heap_base)) as u32
    }

    fn program_counter(&self) -> Option<u32> {
        let value = unsafe { *self.sandbox.vmctx().nth_instruction().get() };

        if value == SANDBOX_EMPTY_NTH_INSTRUCTION {
            None
        } else {
            Some(value)
        }
    }

    fn native_program_counter(&self) -> Option<u64> {
        let value = unsafe { *self.sandbox.vmctx().rip().get() };

        if value == SANDBOX_EMPTY_NATIVE_PROGRAM_COUNTER {
            None
        } else {
            Some(value)
        }
    }

    fn gas_remaining(&self) -> Option<Gas> {
        use super::Sandbox;
        self.sandbox.gas_remaining_impl().ok().unwrap_or(Some(Gas::MIN))
    }

    fn consume_gas(&mut self, gas: u64) {
        if self.sandbox.gas_metering.is_none() { return }
        let gas_remaining = unsafe { &mut *self.sandbox.vmctx().gas().get() };
        *gas_remaining = gas_remaining.checked_sub_unsigned(gas).unwrap_or(-1);
    }
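    // Worked example for `consume_gas` above: with a remaining counter of 10,
    // `consume_gas(25)` leaves it at -15 (the `unwrap_or(-1)` only kicks in if the
    // subtraction would overflow below `i64::MIN`). Any negative value then makes
    // `gas_remaining_impl` fail `Gas::from_i64` and report `OutOfGas`, which the
    // read-only `gas_remaining` above clamps to `Some(Gas::MIN)`:
    //
    //     // assuming *gas_remaining == 10
    //     access.consume_gas(25);                          // counter is now -15
    //     assert_eq!(access.gas_remaining(), Some(Gas::MIN));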
}