1#![allow(clippy::undocumented_unsafe_blocks)]
2#![allow(clippy::manual_range_contains)]
3
4extern crate polkavm_linux_raw as linux_raw;
5
6use polkavm_common::{
7 abi::VM_MAX_PAGE_SIZE,
8 error::{ExecutionError, Trap},
9 program::Reg,
10 utils::{align_to_next_page_usize, slice_assume_init_mut, Access, AsUninitSliceMut, Gas},
11 zygote::{
12 AddressTable, AddressTablePacked,
13 SandboxMemoryConfig, VmCtx, SANDBOX_EMPTY_NATIVE_PROGRAM_COUNTER, SANDBOX_EMPTY_NTH_INSTRUCTION, VMCTX_FUTEX_BUSY,
14 VMCTX_FUTEX_HOSTCALL, VMCTX_FUTEX_IDLE, VMCTX_FUTEX_INIT, VMCTX_FUTEX_TRAP, VM_ADDR_NATIVE_CODE,
15 },
16};
17
18use super::ExecuteArgs;
19
20pub use linux_raw::Error;
21
22use core::ffi::{c_int, c_uint};
23use core::ops::Range;
24use core::sync::atomic::Ordering;
25use core::time::Duration;
26use linux_raw::{abort, cstr, syscall_readonly, Fd, Mmap, STDERR_FILENO, STDIN_FILENO};
27use std::borrow::Cow;
28use std::time::Instant;
29use std::sync::{Arc, Mutex};
30
31use super::{SandboxKind, SandboxInit, SandboxVec, get_native_page_size};
32use crate::api::{BackendAccess, CompiledModuleKind, MemoryAccessError, Module, HostcallHandler};
33use crate::compiler::CompiledModule;
34use crate::config::GasMeteringKind;
35
/// Configuration used when spawning a Linux sandbox.
pub struct SandboxConfig {
    /// When set, a dedicated logging pipe is created for the child at spawn time.
    enable_logger: bool,
}

impl SandboxConfig {
    /// Creates a configuration with the logger turned off.
    pub fn new() -> Self {
        Self { enable_logger: false }
    }
}
45
impl super::SandboxConfig for SandboxConfig {
    /// Enables or disables the logging pipe for sandboxes spawned with this configuration.
    fn enable_logger(&mut self, value: bool) {
        self.enable_logger = value;
    }
}
51
52impl Default for SandboxConfig {
53 fn default() -> Self {
54 Self::new()
55 }
56}
57
/// Argument block passed by pointer to the raw `clone3` syscall when the sandbox child is spawned.
///
/// `#[repr(C)]` fixes the field order and layout, since the kernel reads this structure
/// straight from memory.
/// NOTE(review): the layout is presumably meant to mirror the kernel's `struct clone_args`;
/// confirm against the `clone3(2)` manual page before changing any field.
#[repr(C)]
struct CloneArgs {
    /// Bitmask of `CLONE_*` flags.
    flags: u64,
    /// Location where the child's pidfd gets stored (used together with `CLONE_PIDFD`).
    pidfd: *mut c_int,
    /// Left at zero by the spawn code.
    child_tid: u64,
    /// Left at zero by the spawn code.
    parent_tid: u64,
    /// Left at zero by the spawn code.
    exit_signal: u64,
    /// Left at zero by the spawn code.
    stack: u64,
    /// Left at zero by the spawn code.
    stack_size: u64,
    /// Left at zero by the spawn code.
    tls: u64,
}
77
78fn close_other_file_descriptors(preserved_fds: &[c_int]) -> Result<(), Error> {
80 let mut start_at = 0;
81 for &fd in preserved_fds {
82 if start_at == fd {
83 start_at = fd + 1;
84 continue;
85 }
86
87 if start_at > fd {
88 return Err(Error::from_str("internal error: preserved file descriptors are not sorted"));
90 }
91
92 if linux_raw::sys_close_range(start_at, fd - 1, 0).is_err() {
93 return close_other_file_descriptors_legacy(preserved_fds);
94 }
95
96 start_at = fd + 1;
97 }
98
99 if linux_raw::sys_close_range(start_at, c_int::MAX, 0).is_err() {
100 return close_other_file_descriptors_legacy(preserved_fds);
101 }
102
103 Ok(())
104}
105
106fn close_other_file_descriptors_legacy(preserved_fds: &[c_int]) -> Result<(), Error> {
110 let dirfd = linux_raw::sys_open(
111 cstr!("/proc/self/fd"),
112 linux_raw::O_RDONLY | linux_raw::O_DIRECTORY | linux_raw::O_CLOEXEC,
113 )?;
114 for dirent in linux_raw::readdir(dirfd.borrow()) {
115 let dirent = dirent?;
116 let name = dirent.d_name();
117 if !name.iter().all(|&byte| byte >= b'0' && byte <= b'9') {
118 continue;
119 }
120
121 let name = core::str::from_utf8(name)
122 .ok()
123 .ok_or_else(|| Error::from_str("entry in '/proc/self/fd' is not valid utf-8"))?;
124 let fd: c_int = name
125 .parse()
126 .ok()
127 .ok_or_else(|| Error::from_str("entry in '/proc/self/fd' is not a number"))?;
128 if fd == dirfd.raw() || preserved_fds.iter().any(|&pfd| pfd == fd) {
129 continue;
130 }
131
132 Fd::from_raw_unchecked(fd).close()?;
133 }
134
135 dirfd.close()?;
136 Ok(())
137}
138
139struct Sigmask {
140 sigset_original: linux_raw::kernel_sigset_t,
141}
142
143impl Sigmask {
144 fn block_all_signals() -> Result<Self, Error> {
146 let sigset_all: linux_raw::kernel_sigset_t = !0;
147 let mut sigset_original: linux_raw::kernel_sigset_t = 0;
148 unsafe { linux_raw::sys_rt_sigprocmask(linux_raw::SIG_SETMASK, &sigset_all, Some(&mut sigset_original))? };
149
150 Ok(Sigmask { sigset_original })
151 }
152
153 fn unblock(mut self) -> Result<(), Error> {
155 let result = self.unblock_inplace();
156 core::mem::forget(self);
157 result
158 }
159
160 fn unblock_inplace(&mut self) -> Result<(), Error> {
162 unsafe { linux_raw::sys_rt_sigprocmask(linux_raw::SIG_SETMASK, &self.sigset_original, None) }
163 }
164}
165
166impl Drop for Sigmask {
167 fn drop(&mut self) {
168 let _ = self.unblock_inplace();
169 }
170}
171
/// Handle to the spawned sandbox child process.
#[derive(Debug)]
struct ChildProcess {
    /// PID of the child.
    pid: c_int,
    /// Pidfd of the child; when present it is used instead of `pid` for waiting and signalling.
    pidfd: Option<Fd>,
}
177
/// State of the child process as observed through `waitid`.
#[derive(Debug)]
enum ChildStatus {
    /// The child has not changed state yet.
    Running,
    /// There is no child to wait for (`waitid` reported `ECHILD`).
    NotRunning,
    /// The child exited by itself with the given status.
    Exited(c_int),
    /// The child was terminated by the given signal.
    ExitedDueToSignal(c_int),
}

impl ChildStatus {
    /// Returns `true` only for [`ChildStatus::Running`].
    pub fn is_running(&self) -> bool {
        match self {
            Self::Running => true,
            Self::NotRunning | Self::Exited(_) | Self::ExitedDueToSignal(_) => false,
        }
    }
}
191
192struct Signal(c_int);
193impl core::fmt::Display for Signal {
194 fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
195 let name = match self.0 as u32 {
196 linux_raw::SIGABRT => "SIGABRT",
197 linux_raw::SIGBUS => "SIGBUS",
198 linux_raw::SIGCHLD => "SIGCHLD",
199 linux_raw::SIGCONT => "SIGCONT",
200 linux_raw::SIGFPE => "SIGFPE",
201 linux_raw::SIGHUP => "SIGHUP",
202 linux_raw::SIGILL => "SIGILL",
203 linux_raw::SIGINT => "SIGINT",
204 linux_raw::SIGKILL => "SIGKILL",
205 linux_raw::SIGPIPE => "SIGPIPE",
206 linux_raw::SIGSEGV => "SIGSEGV",
207 linux_raw::SIGSTOP => "SIGSTOP",
208 linux_raw::SIGSYS => "SIGSYS",
209 linux_raw::SIGTERM => "SIGTERM",
210 linux_raw::SIGTRAP => "SIGTRAP",
211 _ => return write!(fmt, "{}", self.0)
212 };
213
214 fmt.write_str(name)
215 }
216}
217
218impl core::fmt::Display for ChildStatus {
219 fn fmt(&self, fmt: &mut core::fmt::Formatter) -> core::fmt::Result {
220 match self {
221 ChildStatus::Running => fmt.write_str("running"),
222 ChildStatus::NotRunning => fmt.write_str("not running"),
223 ChildStatus::Exited(code) => write!(fmt, "exited (status = {code})"),
224 ChildStatus::ExitedDueToSignal(signum) => write!(fmt, "exited due to signal (signal = {})", Signal(*signum)),
225 }
226 }
227}
228
229impl ChildProcess {
230 fn waitid(&mut self, flags: u32) -> Result<linux_raw::siginfo_t, Error> {
231 let mut siginfo: linux_raw::siginfo_t = unsafe { core::mem::zeroed() };
232 let mut result;
233 loop {
234 result = if let Some(ref pidfd) = self.pidfd {
235 linux_raw::sys_waitid(linux_raw::P_PIDFD, pidfd.raw(), &mut siginfo, flags, None)
236 } else {
237 linux_raw::sys_waitid(linux_raw::P_PID, self.pid, &mut siginfo, flags, None)
238 };
239
240 if let Err(error) = result {
241 if error.errno() == linux_raw::EINTR {
242 continue;
244 }
245
246 return Err(error);
247 }
248
249 return Ok(siginfo);
250 }
251 }
252
253 fn check_status(&mut self, non_blocking: bool) -> Result<ChildStatus, Error> {
254 let mut flags = linux_raw::WEXITED | linux_raw::__WALL;
259 if non_blocking {
260 flags |= linux_raw::WNOHANG;
261 }
262
263 match self.waitid(flags) {
264 Ok(ok) => unsafe {
265 if ok.si_signo() == 0 && ok.si_pid() == 0 {
266 Ok(ChildStatus::Running)
267 } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_EXITED {
268 Ok(ChildStatus::Exited(ok.si_status()))
269 } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && (ok.si_code() as u32 == linux_raw::CLD_KILLED || ok.si_code() as u32 == linux_raw::CLD_DUMPED) {
270 Ok(ChildStatus::ExitedDueToSignal(linux_raw::WTERMSIG(ok.si_status())))
271 } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_STOPPED {
272 Err(Error::from_last_os_error("waitid failed: unexpected CLD_STOPPED status"))
273 } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_TRAPPED {
274 Err(Error::from_last_os_error("waitid failed: unexpected CLD_TRAPPED status"))
275 } else if ok.si_signo() as u32 == linux_raw::SIGCHLD && ok.si_code() as u32 == linux_raw::CLD_CONTINUED {
276 Err(Error::from_last_os_error("waitid failed: unexpected CLD_CONTINUED status"))
277 } else if ok.si_signo() != 0 {
278 Ok(ChildStatus::ExitedDueToSignal(ok.si_signo()))
279 } else {
280 Err(Error::from_last_os_error("waitid failed: internal error: unexpected state"))
281 }
282 },
283 Err(error) => {
284 if error.errno() == linux_raw::ECHILD {
285 Ok(ChildStatus::NotRunning)
286 } else {
287 Err(error)
288 }
289 }
290 }
291 }
292
293 fn send_signal(&mut self, signal: c_uint) -> Result<(), Error> {
294 unsafe {
295 if let Some(ref pidfd) = self.pidfd {
296 let errcode = syscall_readonly!(linux_raw::SYS_pidfd_send_signal, pidfd, signal, 0, 0);
297 Error::from_syscall("pidfd_send_signal", errcode)
298 } else {
299 linux_raw::sys_kill(self.pid, signal)
300 }
301 }
302 }
303}
304
impl Drop for ChildProcess {
    /// Terminates the child when its handle goes away; failures are ignored.
    fn drop(&mut self) {
        // Debug-zygote builds only send SIGINT and do not wait for the child.
        #[cfg(polkavm_dev_debug_zygote)]
        let _ = self.send_signal(linux_raw::SIGINT);

        // Regular builds kill the child and, when the signal could be sent, do a
        // blocking status check (`non_blocking = false`) to wait for it.
        #[cfg(not(polkavm_dev_debug_zygote))]
        if self.send_signal(linux_raw::SIGKILL).is_ok() {
            let _ = self.check_status(false);
        }
    }
}
317
/// Bytes of the zygote executable, embedded at compile time. This `const` form is what
/// the compile-time parsing in `ZYGOTE_ADDRESS_TABLE` reads.
const ZYGOTE_BLOB_CONST: &[u8] = include_bytes!("./polkavm-zygote");
/// The same bytes exposed as a `static`, for use at runtime.
static ZYGOTE_BLOB: &[u8] = ZYGOTE_BLOB_CONST;
320
/// Address table of the embedded zygote, extracted at compile time.
///
/// The embedded blob is parsed as an ELF file inside a constant expression: the section
/// headers are walked until a section whose name starts with `.address_table` is found,
/// and its contents are unpacked with `AddressTable::from_packed`. Compilation fails
/// (through a panic during constant evaluation) if no such section exists or if any of
/// the size assertions below do not hold.
const ZYGOTE_ADDRESS_TABLE: AddressTable = {
    // `const fn` compatible prefix check (written with a `while` loop instead of
    // iterators so that it can run during constant evaluation).
    const fn starts_with(haystack: &[u8], needle: &[u8]) -> bool {
        if haystack.len() < needle.len() {
            return false;
        }

        let mut index = 0;
        while index < needle.len() {
            if haystack[index] != needle[index] {
                return false;
            }
            index += 1;
        }

        true
    }

    // Reinterprets the start of `slice` as a `T`.
    //
    // The asserts make sure the slice is long enough and that `T` has an alignment
    // of 1, so the pointer cast cannot produce a misaligned or out-of-bounds reference.
    // NOTE(review): this additionally assumes every bit pattern is valid for `T`, which
    // holds for the byte-array based structs it is used with below.
    const fn cast_slice<T>(slice: &[u8]) -> &T where T: Copy {
        assert!(slice.len() >= core::mem::size_of::<T>());
        assert!(core::mem::align_of::<T>() == 1);

        unsafe {
            &*slice.as_ptr().cast::<T>()
        }
    }

    // Integers stored as raw byte arrays so that the structs containing them have an
    // alignment of 1; they are decoded using the native endianness.
    #[repr(C)]
    #[derive(Copy, Clone)]
    struct U16([u8; 2]);

    impl U16 {
        const fn get(self) -> u16 {
            u16::from_ne_bytes(self.0)
        }
    }

    #[repr(C)]
    #[derive(Copy, Clone)]
    struct U32([u8; 4]);

    impl U32 {
        const fn get(self) -> u32 {
            u32::from_ne_bytes(self.0)
        }
    }

    #[repr(C)]
    #[derive(Copy, Clone)]
    struct U64([u8; 8]);

    impl U64 {
        const fn get(self) -> u64 {
            u64::from_ne_bytes(self.0)
        }
    }

    // The 16-byte identification block at the very start of the ELF header.
    #[repr(C)]
    #[derive(Copy, Clone)]
    struct ElfIdent {
        magic: [u8; 4],
        class: u8,
        data: u8,
        version: u8,
        os_abi: u8,
        abi_version: u8,
        padding: [u8; 7],
    }

    // ELF file header.
    // NOTE(review): the field widths declared here presumably follow the 64-bit ELF
    // layout; confirm against the ELF specification before touching them.
    #[repr(C)]
    #[derive(Copy, Clone)]
    struct ElfHeader {
        e_ident: ElfIdent,
        e_type: U16,
        e_machine: U16,
        e_version: U32,
        e_entry: U64,
        e_phoff: U64,
        e_shoff: U64,
        e_flags: U32,
        e_ehsize: U16,
        e_phentsize: U16,
        e_phnum: U16,
        e_shentsize: U16,
        e_shnum: U16,
        e_shstrndx: U16,
    }

    // ELF section header; same caveat regarding the layout as for `ElfHeader`.
    #[repr(C)]
    #[derive(Copy, Clone)]
    struct ElfSectionHeader {
        sh_name: U32,
        sh_type: U32,
        sh_flags: U64,
        sh_addr: U64,
        sh_offset: U64,
        sh_size: U64,
        sh_link: U32,
        sh_info: U32,
        sh_addralign: U64,
        sh_entsize: U64,
    }

    impl ElfHeader {
        // Returns the header of the `nth_section`-th section of `blob`.
        const fn section_header<'a>(&self, blob: &'a [u8], nth_section: u16) -> &'a ElfSectionHeader {
            let size = self.e_shentsize.get() as usize;
            // The file must use exactly the section header size this code was written for.
            assert!(size == core::mem::size_of::<ElfSectionHeader>());

            let offset = self.e_shoff.get() as usize + nth_section as usize * size;
            cast_slice(blob.split_at(offset).1)
        }
    }

    impl ElfSectionHeader {
        // Returns the bytes of this section: `sh_size` bytes starting at `sh_offset`.
        const fn data<'a>(&self, blob: &'a [u8]) -> &'a [u8] {
            blob.split_at(self.sh_offset.get() as usize).1.split_at(self.sh_size.get() as usize).0
        }
    }

    let header: &ElfHeader = cast_slice(ZYGOTE_BLOB_CONST);
    // Contents of the section-name string table; `sh_name` values are offsets into it.
    let shstr = header.section_header(ZYGOTE_BLOB_CONST, header.e_shstrndx.get()).data(ZYGOTE_BLOB_CONST);

    let mut address_table = None;
    let mut nth_section = 0;
    while nth_section < header.e_shnum.get() {
        let section_header = header.section_header(ZYGOTE_BLOB_CONST, nth_section);
        // This is a prefix match on the section name.
        if starts_with(shstr.split_at(section_header.sh_name.get() as usize).1, b".address_table") {
            let data = section_header.data(ZYGOTE_BLOB_CONST);
            assert!(data.len() == core::mem::size_of::<AddressTablePacked>());
            address_table = Some(AddressTable::from_packed(cast_slice::<AddressTablePacked>(data)));
            break;
        }
        nth_section += 1;
    }

    let Some(address_table) = address_table else { panic!("broken zygote binary") };
    address_table
};
461
462fn create_empty_memfd(name: &core::ffi::CStr) -> Result<Fd, Error> {
463 linux_raw::sys_memfd_create(name, linux_raw::MFD_CLOEXEC | linux_raw::MFD_ALLOW_SEALING)
464}
465
/// Raw file descriptor of a pre-created, still empty program memfd, or `-1` when nothing is cached.
static CACHED_PROGRAM_MEMFD: core::sync::atomic::AtomicI32 = core::sync::atomic::AtomicI32::new(-1);
468
469fn create_program_memfd() -> Result<Fd, Error> {
470 let memfd_raw = CACHED_PROGRAM_MEMFD.load(Ordering::Relaxed);
471 if memfd_raw != -1 && CACHED_PROGRAM_MEMFD.compare_exchange(memfd_raw, -1, Ordering::Relaxed, Ordering::Relaxed).is_ok() {
472 Ok(Fd::from_raw_unchecked(memfd_raw))
473 } else {
474 create_empty_memfd(cstr!("polkavm_program"))
475 }
476}
477
478fn cache_program_memfd_if_necessary() {
479 if CACHED_PROGRAM_MEMFD.load(Ordering::Relaxed) != -1 {
480 return;
481 }
482
483 let memfd = match create_empty_memfd(cstr!("polkavm_program")) {
484 Ok(memfd) => memfd,
485 Err(error) => {
486 log::warn!("Failed to create a memfd: {error}");
488 return;
489 }
490 };
491
492 if CACHED_PROGRAM_MEMFD.compare_exchange(-1, memfd.raw(), Ordering::Relaxed, Ordering::Relaxed).is_ok() {
493 memfd.leak();
494 }
495}
496
497fn prepare_sealed_memfd<const N: usize>(memfd: Fd, length: usize, data: [&[u8]; N]) -> Result<Fd, Error> {
498 let native_page_size = get_native_page_size();
499 if length % native_page_size != 0 {
500 return Err(Error::from_str("memfd size doesn't end on a page boundary"));
501 }
502
503 linux_raw::sys_ftruncate(memfd.borrow(), length as linux_raw::c_ulong)?;
504
505 let expected_bytes_written = data.iter().map(|slice| slice.len()).sum::<usize>();
506 let bytes_written = linux_raw::writev(memfd.borrow(), data)?;
507 if bytes_written != expected_bytes_written {
508 return Err(Error::from_str("failed to prepare memfd: incomplete write"));
509 }
510
511 linux_raw::sys_fcntl(
512 memfd.borrow(),
513 linux_raw::F_ADD_SEALS,
514 linux_raw::F_SEAL_SEAL | linux_raw::F_SEAL_SHRINK | linux_raw::F_SEAL_GROW | linux_raw::F_SEAL_WRITE,
515 )?;
516
517 Ok(memfd)
518}
519
520fn prepare_zygote() -> Result<Fd, Error> {
521 #[cfg(debug_assertions)]
522 if cfg!(polkavm_dev_debug_zygote) {
523 let paths = [
524 std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../polkavm-zygote/target/x86_64-unknown-linux-gnu/debug/polkavm-zygote"),
525 std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../polkavm-zygote/target/x86_64-unknown-linux-gnu/release/polkavm-zygote"),
526 std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("src/sandbox/polkavm-zygote"),
527 std::path::PathBuf::from("./polkavm-zygote"),
528 ];
529
530 let Some(path) = paths.into_iter().find(|path| {
531 path.exists() && std::fs::read(path).map(|data| data == ZYGOTE_BLOB).unwrap_or(false)
532 }) else {
533 panic!("no matching zygote binary found for debugging");
534 };
535
536 let path = std::ffi::CString::new(path.to_str().expect("invalid path to zygote")).expect("invalid path to zygote");
537 return Ok(linux_raw::sys_open(&path, linux_raw::O_CLOEXEC | linux_raw::O_PATH).unwrap());
538 }
539
540 let native_page_size = get_native_page_size();
541
542 #[allow(clippy::unwrap_used)]
543 let length_aligned = align_to_next_page_usize(native_page_size, ZYGOTE_BLOB.len()).unwrap();
545 prepare_sealed_memfd(create_empty_memfd(cstr!("polkavm_zygote"))?, length_aligned, [ZYGOTE_BLOB])
546}
547
548fn prepare_vmctx() -> Result<(Fd, Mmap), Error> {
549 let native_page_size = get_native_page_size();
550
551 #[allow(clippy::unwrap_used)] let length_aligned = align_to_next_page_usize(native_page_size, core::mem::size_of::<VmCtx>()).unwrap();
553
554 let memfd = create_empty_memfd(cstr!("polkavm_vmctx"))?;
555 linux_raw::sys_ftruncate(memfd.borrow(), length_aligned as linux_raw::c_ulong)?;
556 linux_raw::sys_fcntl(
557 memfd.borrow(),
558 linux_raw::F_ADD_SEALS,
559 linux_raw::F_SEAL_SEAL | linux_raw::F_SEAL_SHRINK | linux_raw::F_SEAL_GROW,
560 )?;
561
562 let vmctx = unsafe {
563 linux_raw::Mmap::map(
564 core::ptr::null_mut(),
565 length_aligned,
566 linux_raw::PROT_READ | linux_raw::PROT_WRITE,
567 linux_raw::MAP_SHARED,
568 Some(memfd.borrow()),
569 0,
570 )?
571 };
572
573 unsafe {
574 *vmctx.as_mut_ptr().cast::<VmCtx>() = VmCtx::new();
575 }
576
577 Ok((memfd, vmctx))
578}
579
/// Entry point of the freshly cloned child: locks the process down and then replaces it
/// with the zygote binary.
///
/// * `zygote_memfd` - descriptor from which the zygote gets executed.
/// * `child_socket` - the child's end of the socket pair; ends up as stdin.
/// * `uid_map` / `gid_map` - contents written into `/proc/self/uid_map` and `/proc/self/gid_map`.
/// * `logging_pipe` - optional pipe which ends up as stderr.
///
/// On success `execveat` does not return; the visible caller aborts the process whenever
/// this function returns, no matter whether with `Ok` or `Err`.
///
/// # Safety
///
/// NOTE(review): presumably this must only be called in the child right after the clone,
/// since it closes file descriptors and changes process-wide state — confirm with the caller.
unsafe fn child_main(zygote_memfd: Fd, child_socket: Fd, uid_map: &str, gid_map: &str, logging_pipe: Option<Fd>) -> Result<(), Error> {
    linux_raw::sys_prctl_set_name(b"polkavm-sandbox\0")?;

    if !cfg!(polkavm_dev_debug_zygote) {
        linux_raw::sys_sethostname("localhost")?;
        linux_raw::sys_setdomainname("localhost")?;

        let proc_self = linux_raw::sys_open(cstr!("/proc/self"), linux_raw::O_CLOEXEC | linux_raw::O_PATH)?;
        // Write "deny" into `setgroups` first, then fill in the gid and uid maps.
        let fd = linux_raw::sys_openat(proc_self.borrow(), cstr!("setgroups"), linux_raw::O_CLOEXEC | linux_raw::O_WRONLY)?;
        linux_raw::sys_write(fd.borrow(), b"deny")?;
        fd.close()?;

        let fd = linux_raw::sys_openat(proc_self.borrow(), cstr!("gid_map"), linux_raw::O_CLOEXEC | linux_raw::O_RDWR)?;
        linux_raw::sys_write(fd.borrow(), gid_map.as_bytes())?;
        fd.close()?;

        let fd = linux_raw::sys_openat(proc_self.borrow(), cstr!("uid_map"), linux_raw::O_CLOEXEC | linux_raw::O_RDWR)?;
        linux_raw::sys_write(fd.borrow(), uid_map.as_bytes())?;
        fd.close()?;
        proc_self.close()?;
    }

    // Value used for RLIMIT_NOFILE further down; one higher when a logging pipe is kept.
    let fd_limit = if logging_pipe.is_some() {
        4
    } else {
        3
    };

    // None of the descriptors we were given may already sit on stdin/stderr, since those
    // two slots are about to be overwritten through `dup3`.
    for fd in [zygote_memfd.raw(), child_socket.raw()].into_iter().chain(logging_pipe.as_ref().map(|fd| fd.raw())) {
        if fd == STDIN_FILENO {
            return Err(Error::from_str("internal error: fd overlaps with stdin"));
        }

        if fd == STDERR_FILENO {
            return Err(Error::from_str("internal error: fd overlaps with stderr"));
        }
    }

    // The socket becomes stdin.
    linux_raw::sys_dup3(child_socket.raw(), STDIN_FILENO, 0)?;
    child_socket.close()?;

    // Unused slots stay at `c_int::MAX`, so after sorting they end up at the back and
    // the first `count` entries are exactly the descriptors which must survive.
    let mut fds_to_keep = [core::ffi::c_int::MAX; 3];
    let fds_to_keep = {
        let mut count = 1;
        fds_to_keep[0] = STDIN_FILENO;
        if let Some(logging_pipe) = logging_pipe {
            // The logging pipe becomes stderr.
            linux_raw::sys_dup3(logging_pipe.raw(), STDERR_FILENO, 0)?;
            logging_pipe.close()?;
            fds_to_keep[count] = STDERR_FILENO;
            count += 1;
        }

        fds_to_keep[count] = zygote_memfd.raw();
        count += 1;

        fds_to_keep.sort_unstable();
        &fds_to_keep[..count]
    };
    close_other_file_descriptors(fds_to_keep)?;

    if !cfg!(polkavm_dev_debug_zygote) {
        // Mount an empty (`size=0`), read-only tmpfs on `/mnt`, make it the new root and
        // detach the mount again.
        let mount_flags = linux_raw::MS_REC | linux_raw::MS_NODEV | linux_raw::MS_NOEXEC | linux_raw::MS_NOSUID | linux_raw::MS_RDONLY;
        linux_raw::sys_mount(cstr!("none"), cstr!("/mnt"), cstr!("tmpfs"), mount_flags, Some(cstr!("size=0")))?;
        linux_raw::sys_chdir(cstr!("/mnt"))?;
        linux_raw::sys_pivot_root(cstr!("."), cstr!("."))?;
        linux_raw::sys_umount2(cstr!("."), linux_raw::MNT_DETACH)?;
    }

    linux_raw::sys_prctl_cap_ambient_clear_all()?;

    linux_raw::sys_prctl_set_no_new_privs()?;

    if !cfg!(polkavm_dev_debug_zygote) {
        // Set the secure bits together with their `_LOCKED` counterparts.
        linux_raw::sys_prctl_set_securebits(
            linux_raw::SECBIT_NOROOT |
            linux_raw::SECBIT_NOROOT_LOCKED |
            linux_raw::SECBIT_NO_SETUID_FIXUP |
            linux_raw::SECBIT_NO_SETUID_FIXUP_LOCKED |
            linux_raw::SECBIT_NO_CAP_AMBIENT_RAISE |
            linux_raw::SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED,
        )?;
    }

    // Resource limits: 8 GiB for RLIMIT_DATA, 16 KiB of stack, `fd_limit` for
    // RLIMIT_NOFILE, 1 for RLIMIT_NPROC, and zero for file size, locks, locked memory
    // and message queues.
    let max_memory = 8 * 1024 * 1024 * 1024;
    linux_raw::sys_setrlimit(
        linux_raw::RLIMIT_DATA,
        &linux_raw::rlimit {
            rlim_cur: max_memory,
            rlim_max: max_memory,
        },
    )?;
    linux_raw::sys_setrlimit(
        linux_raw::RLIMIT_STACK,
        &linux_raw::rlimit {
            rlim_cur: 16 * 1024,
            rlim_max: 16 * 1024,
        },
    )?;

    linux_raw::sys_setrlimit(linux_raw::RLIMIT_NOFILE, &linux_raw::rlimit { rlim_cur: fd_limit, rlim_max: fd_limit })?;
    linux_raw::sys_setrlimit(linux_raw::RLIMIT_NPROC, &linux_raw::rlimit { rlim_cur: 1, rlim_max: 1 })?;
    linux_raw::sys_setrlimit(linux_raw::RLIMIT_FSIZE, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
    linux_raw::sys_setrlimit(linux_raw::RLIMIT_LOCKS, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
    linux_raw::sys_setrlimit(linux_raw::RLIMIT_MEMLOCK, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;
    linux_raw::sys_setrlimit(linux_raw::RLIMIT_MSGQUEUE, &linux_raw::rlimit { rlim_cur: 0, rlim_max: 0 })?;

    linux_raw::sys_capset_drop_all()?;

    if cfg!(polkavm_dev_debug_zygote) {
        // In debug-zygote builds the child stops itself right before the exec.
        let pid = linux_raw::sys_getpid()?;
        linux_raw::sys_kill(pid, linux_raw::SIGSTOP)?;
    }

    // Execute the zygote straight from its descriptor, with an empty environment.
    let child_argv: [*const u8; 2] = [b"polkavm-zygote\0".as_ptr(), core::ptr::null()];
    let child_envp: [*const u8; 1] = [core::ptr::null()];
    linux_raw::sys_execveat(
        Some(zygote_memfd.borrow()),
        cstr!(""),
        &child_argv,
        &child_envp,
        linux_raw::AT_EMPTY_PATH,
    )?;

    Ok(())
}
723
/// A program prepared for execution inside the sandbox; cloning only bumps a reference count.
#[derive(Clone)]
pub struct SandboxProgram(Arc<SandboxProgramInner>);

/// Shared state behind a [`SandboxProgram`].
struct SandboxProgramInner {
    /// Sealed memfd holding the program's read-only data, read-write data, code and jump table.
    memfd: Fd,
    /// Memory configuration which was computed for the program.
    memory_config: SandboxMemoryConfig,
    /// Byte range inside `memfd` which holds the machine code.
    code_range: Range<usize>,
}
732
733impl super::SandboxProgram for SandboxProgram {
734 fn machine_code(&self) -> Cow<[u8]> {
735 let mut buffer = vec![0; self.0.code_range.len()];
738 linux_raw::sys_lseek(self.0.memfd.borrow(), self.0.code_range.start as i64, linux_raw::SEEK_SET).expect("failed to get machine code of the program: seek failed");
739
740 let mut position = 0;
741 while position < self.0.code_range.len() {
742 let count = match linux_raw::sys_read(self.0.memfd.borrow(), &mut buffer[position..]) {
743 Ok(count) => count,
744 Err(error) if error.errno() == linux_raw::EINTR => continue,
745 Err(error) => panic!("failed to get machine code of the program: read failed: {error}")
746 };
747
748 assert_ne!(count, 0);
749 position += count as usize;
750 }
751
752 Cow::Owned(buffer)
753 }
754}
755
/// A single parsed memory-map entry.
///
/// NOTE(review): the format accepted by `Map::parse` looks like one line of
/// `/proc/<pid>/maps` — confirm against the caller.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct Map<'a> {
    /// Start address of the mapping (parsed as hexadecimal).
    pub start: u64,
    /// End address of the mapping (parsed as hexadecimal).
    pub end: u64,
    /// Whether the first permission character is `r`.
    pub is_readable: bool,
    /// Whether the second permission character is `w`.
    pub is_writable: bool,
    /// Whether the third permission character is `x`.
    pub is_executable: bool,
    /// Whether the fourth permission character is `s`.
    pub is_shared: bool,
    /// Offset into the mapped file (parsed as hexadecimal).
    pub file_offset: u64,
    /// Major device number (parsed as hexadecimal).
    pub major: u64,
    /// Minor device number (parsed as hexadecimal).
    pub minor: u64,
    /// Inode number (parsed as decimal).
    pub inode: u64,
    /// Whatever is left of the line after the inode and any following spaces.
    pub name: &'a [u8],
}
770
/// Parses `input` as a `u64` written in the given `radix`.
///
/// Returns `None` when the bytes are not valid UTF-8 or do not form a valid number.
fn parse_u64_radix(input: &[u8], radix: u32) -> Option<u64> {
    let text = core::str::from_utf8(input).ok()?;
    u64::from_str_radix(text, radix).ok()
}
774
/// Splits off everything in front of the first `delimiter`.
///
/// Returns the bytes before the delimiter and advances `p` past it. When the delimiter
/// does not occur, the whole input is returned and `p` is left empty.
fn get_until<'a>(p: &mut &'a [u8], delimiter: u8) -> &'a [u8] {
    let input: &'a [u8] = *p;
    match input.iter().position(|&byte| byte == delimiter) {
        Some(index) => {
            *p = &input[index + 1..];
            &input[..index]
        }
        None => {
            *p = b"";
            input
        }
    }
}
794
/// Removes the first byte from `p` and returns it, or returns `None` (leaving `p`
/// untouched) when `p` is empty.
fn get_char(p: &mut &[u8]) -> Option<u8> {
    let current = *p;
    let (&head, tail) = current.split_first()?;
    *p = tail;
    Some(head)
}
800
/// Advances `p` past any leading ASCII space characters (only `b' '`; tabs and
/// newlines are left alone).
fn skip_whitespace(p: &mut &[u8]) {
    let current = *p;
    let spaces = current.iter().take_while(|&&byte| byte == b' ').count();
    *p = &current[spaces..];
}
810
811impl<'a> Map<'a> {
812 fn parse(mut line: &'a [u8]) -> Option<Self> {
813 let start = parse_u64_radix(get_until(&mut line, b'-'), 16)?;
814 let end = parse_u64_radix(get_until(&mut line, b' '), 16)?;
815 let is_readable = get_char(&mut line)? == b'r';
816 let is_writable = get_char(&mut line)? == b'w';
817 let is_executable = get_char(&mut line)? == b'x';
818 let is_shared = get_char(&mut line)? == b's';
819 get_char(&mut line);
820
821 let file_offset = parse_u64_radix(get_until(&mut line, b' '), 16)?;
822 let major = parse_u64_radix(get_until(&mut line, b':'), 16)?;
823 let minor = parse_u64_radix(get_until(&mut line, b' '), 16)?;
824 let inode = parse_u64_radix(get_until(&mut line, b' '), 10)?;
825 skip_whitespace(&mut line);
826 let name = line;
827
828 Some(Map {
829 start,
830 end,
831 is_readable,
832 is_writable,
833 is_executable,
834 is_shared,
835 file_offset,
836 major,
837 minor,
838 inode,
839 name,
840 })
841 }
842}
843
844fn get_message(vmctx: &VmCtx) -> Option<String> {
845 let message = unsafe {
846 let message_length = *vmctx.message_length.get() as usize;
847 let message = &*vmctx.message_buffer.get();
848 &message[..core::cmp::min(message_length, message.len())]
849 };
850
851 if message.is_empty() {
852 return None;
853 }
854
855 let message = message.to_vec();
858 match String::from_utf8(message) {
859 Ok(message) => Some(message),
860 Err(error) => {
861 let message = error.into_bytes();
862 Some(String::from_utf8_lossy(&message).into_owned())
863 }
864 }
865}
866
867unsafe fn set_message(vmctx: &VmCtx, message: core::fmt::Arguments) {
868 struct Adapter<'a>(std::io::Cursor<&'a mut [u8]>);
869 impl<'a> core::fmt::Write for Adapter<'a> {
870 fn write_str(&mut self, string: &str) -> Result<(), core::fmt::Error> {
871 use std::io::Write;
872 self.0.write_all(string.as_bytes()).map_err(|_| core::fmt::Error)
873 }
874 }
875
876 let buffer: &mut [u8] = &mut *vmctx.message_buffer.get();
877 let mut cursor = Adapter(std::io::Cursor::new(buffer));
878 let _ = core::fmt::write(&mut cursor, message);
879 let length = cursor.0.position() as usize;
880
881 *vmctx.message_length.get() = length as u32;
882}
883
/// Host-side handle to one sandboxed child process.
pub struct Sandbox {
    /// Held only so that it stays open for as long as the sandbox exists.
    /// NOTE(review): presumably the host end of the lifetime pipe created at spawn time — confirm.
    _lifetime_pipe: Fd,
    /// Host-side mapping of the shared memory containing the `VmCtx`.
    vmctx_mmap: Mmap,
    /// The child process backing this sandbox.
    child: ChildProcess,
    /// NOTE(review): presumably the host end of the socket pair created at spawn time — confirm.
    socket: Fd,

    /// Statistics counter; used as the denominator of the host futex wait ratio logged on drop.
    count_wait_loop_start: u64,
    /// Statistics counter; used as the numerator of the host futex wait ratio logged on drop.
    count_futex_wait: u64,

    /// NOTE(review): presumably the module currently associated with the sandbox, if any — confirm.
    module: Option<Module>,
    /// NOTE(review): presumably the gas metering mode currently in effect, if any — confirm.
    gas_metering: Option<GasMeteringKind>,
}
896
897impl Drop for Sandbox {
898 fn drop(&mut self) {
899 let vmctx = self.vmctx();
900 let child_futex_wait = unsafe { *vmctx.counters.syscall_futex_wait.get() };
901 let child_loop_start = unsafe { *vmctx.counters.syscall_wait_loop_start.get() };
902 log::debug!(
903 "Host futex wait count: {}/{} ({:.02}%)",
904 self.count_futex_wait,
905 self.count_wait_loop_start,
906 self.count_futex_wait as f64 / self.count_wait_loop_start as f64 * 100.0
907 );
908 log::debug!(
909 "Child futex wait count: {}/{} ({:.02}%)",
910 child_futex_wait,
911 child_loop_start,
912 child_futex_wait as f64 / child_loop_start as f64 * 100.0
913 );
914 }
915}
916
/// This backend uses the unit type as its address space, so it carries no state.
impl super::SandboxAddressSpace for () {
    /// Always returns the fixed `VM_ADDR_NATIVE_CODE` address.
    fn native_code_address(&self) -> u64 {
        VM_ADDR_NATIVE_CODE
    }
}
922
923impl super::Sandbox for Sandbox {
924 const KIND: SandboxKind = SandboxKind::Linux;
925
926 type Access<'r> = SandboxAccess<'r>;
927 type Config = SandboxConfig;
928 type Error = Error;
929 type Program = SandboxProgram;
930 type AddressSpace = ();
931
932 fn as_sandbox_vec(vec: &SandboxVec) -> &Mutex<Vec<Self>> {
933 #[allow(clippy::match_wildcard_for_single_variants)]
934 match vec {
935 SandboxVec::Linux(ref vec) => vec,
936 _ => unreachable!(),
937 }
938 }
939
940 fn as_compiled_module(module: &Module) -> &CompiledModule<Self> {
941 match module.compiled_module() {
942 CompiledModuleKind::Linux(ref module) => module,
943 _ => unreachable!(),
944 }
945 }
946
947 fn reserve_address_space() -> Result<Self::AddressSpace, Self::Error> {
948 Ok(())
949 }
950
951 fn prepare_program(init: SandboxInit, (): Self::AddressSpace) -> Result<Self::Program, Self::Error> {
952 static PADDING: [u8; VM_MAX_PAGE_SIZE as usize] = [0; VM_MAX_PAGE_SIZE as usize];
953
954 let native_page_size = get_native_page_size();
955 let cfg = init.memory_config(native_page_size)?;
956 let ro_data_padding = &PADDING[..cfg.ro_data_fd_size as usize - init.guest_init.ro_data.len()];
957 let rw_data_padding = &PADDING[..cfg.rw_data_fd_size as usize - init.guest_init.rw_data.len()];
958 let code_padding = &PADDING[..cfg.code_size as usize - init.code.len()];
959
960 let memfd = prepare_sealed_memfd(
961 create_program_memfd()?,
962 cfg.ro_data_fd_size as usize + cfg.rw_data_fd_size as usize + cfg.code_size as usize + cfg.jump_table_size as usize,
963 [
964 init.guest_init.ro_data,
965 ro_data_padding,
966 init.guest_init.rw_data,
967 rw_data_padding,
968 init.code,
969 code_padding,
970 init.jump_table
971 ]
972 )?;
973
974 let code_offset = cfg.ro_data_fd_size as usize + cfg.rw_data_fd_size as usize;
975 let code_range = code_offset..code_offset + init.code.len();
976
977 Ok(SandboxProgram(Arc::new(SandboxProgramInner {
978 memfd,
979 memory_config: cfg,
980 code_range,
981 })))
982 }
983
984 fn spawn(config: &SandboxConfig) -> Result<Self, Error> {
985 let sigset = Sigmask::block_all_signals()?;
986 let zygote_memfd = prepare_zygote()?;
987 let (vmctx_memfd, vmctx_mmap) = prepare_vmctx()?;
988 let (socket, child_socket) = linux_raw::sys_socketpair(linux_raw::AF_UNIX, linux_raw::SOCK_SEQPACKET | linux_raw::SOCK_CLOEXEC, 0)?;
989 let (lifetime_pipe_host, lifetime_pipe_child) = linux_raw::sys_pipe2(linux_raw::O_CLOEXEC)?;
990
991 let sandbox_flags =
992 if !cfg!(polkavm_dev_debug_zygote) {
993 u64::from(linux_raw::CLONE_NEWCGROUP
994 | linux_raw::CLONE_NEWIPC
995 | linux_raw::CLONE_NEWNET
996 | linux_raw::CLONE_NEWNS
997 | linux_raw::CLONE_NEWPID
998 | linux_raw::CLONE_NEWUSER
999 | linux_raw::CLONE_NEWUTS)
1000 } else {
1001 0
1002 };
1003
1004 let mut pidfd: c_int = -1;
1005 let args = CloneArgs {
1006 flags: linux_raw::CLONE_CLEAR_SIGHAND | u64::from(linux_raw::CLONE_PIDFD) | sandbox_flags,
1007 pidfd: &mut pidfd,
1008 child_tid: 0,
1009 parent_tid: 0,
1010 exit_signal: 0,
1011 stack: 0,
1012 stack_size: 0,
1013 tls: 0,
1014 };
1015
1016 let uid = linux_raw::sys_getuid()?;
1017 let gid = linux_raw::sys_getgid()?;
1018
1019 let uid_map = format!("0 {} 1\n", uid);
1020 let gid_map = format!("0 {} 1\n", gid);
1021
1022 let (logger_rx, logger_tx) = if config.enable_logger {
1023 let (rx, tx) = linux_raw::sys_pipe2(linux_raw::O_CLOEXEC)?;
1024 (Some(rx), Some(tx))
1025 } else {
1026 (None, None)
1027 };
1028
1029 let mut child_pid =
1031 unsafe { linux_raw::syscall!(linux_raw::SYS_clone3, core::ptr::addr_of!(args), core::mem::size_of::<CloneArgs>()) };
1032
1033 if child_pid < 0 {
1034 let error = Error::from_last_os_error("clone");
1036 child_pid = unsafe { linux_raw::syscall!(linux_raw::SYS_clone, sandbox_flags, 0, 0, 0, 0) };
1037
1038 if child_pid < 0 {
1039 return Err(error);
1040 }
1041 }
1042
1043 if child_pid == 0 {
1044 core::mem::forget(sigset);
1050
1051 unsafe {
1052 match child_main(zygote_memfd, child_socket, &uid_map, &gid_map, logger_tx) {
1053 Ok(()) => {
1054 abort();
1056 }
1057 Err(error) => {
1058 let vmctx = &*vmctx_mmap.as_ptr().cast::<VmCtx>();
1059 set_message(vmctx, format_args!("fatal error while spawning child: {error}"));
1060
1061 abort();
1062 }
1063 }
1064 }
1065 }
1066
1067 if let Some(logger_rx) = logger_rx {
1068 std::thread::Builder::new()
1070 .name("polkavm-logger".into())
1071 .spawn(move || {
1072 let mut tmp = [0; 4096];
1073 let mut buffer = Vec::new();
1074 loop {
1075 if buffer.len() > 8192 {
1076 buffer.clear();
1078 }
1079
1080 match linux_raw::sys_read(logger_rx.borrow(), &mut tmp) {
1081 Err(error) if error.errno() == linux_raw::EINTR => continue,
1082 Err(error) => {
1083 log::warn!("Failed to read from logger: {}", error);
1084 break;
1085 }
1086 Ok(0) => break,
1087 Ok(count) => {
1088 let mut tmp = &tmp[..count];
1089 while !tmp.is_empty() {
1090 if let Some(index) = tmp.iter().position(|&byte| byte == b'\n') {
1091 buffer.extend_from_slice(&tmp[..index]);
1092 tmp = &tmp[index + 1..];
1093
1094 log::trace!(target: "polkavm::zygote", "Child #{}: {}", child_pid, String::from_utf8_lossy(&buffer));
1095 buffer.clear();
1096 } else {
1097 buffer.extend_from_slice(tmp);
1098 break;
1099 }
1100 }
1101 }
1102 }
1103 }
1104 })
1105 .map_err(|error| Error::from_os_error("failed to spawn logger thread", error))?;
1106 }
1107
1108 let mut child = ChildProcess {
1109 pid: child_pid as c_int,
1110 pidfd: if pidfd < 0 { None } else { Some(Fd::from_raw_unchecked(pidfd)) },
1111 };
1112
1113 child_socket.close()?;
1115 sigset.unblock()?;
1116
1117 fn wait_for_futex(vmctx: &VmCtx, child: &mut ChildProcess, current_state: u32, target_state: u32) -> Result<(), Error> {
1118 let instant = Instant::now();
1119 loop {
1120 let state = vmctx.futex.load(Ordering::Relaxed);
1121 if state == target_state {
1122 return Ok(());
1123 }
1124
1125 if state != current_state {
1126 return Err(Error::from_str("failed to initialize sandbox process: unexpected futex state"));
1127 }
1128
1129 let status = child.check_status(true)?;
1130 if !status.is_running() {
1131 let message = get_message(vmctx);
1132 if let Some(message) = message {
1133 let error = Error::from(format!("failed to initialize sandbox process: {status}: {message}"));
1134 return Err(error);
1135 } else {
1136 return Err(Error::from(format!(
1137 "failed to initialize sandbox process: child process unexpectedly quit: {status}",
1138 )));
1139 }
1140 }
1141
1142 if !cfg!(polkavm_dev_debug_zygote) && instant.elapsed() > core::time::Duration::from_secs(10) {
1143 return Err(Error::from_str("failed to initialize sandbox process: initialization timeout"));
1145 }
1146
1147 match linux_raw::sys_futex_wait(&vmctx.futex, state, Some(core::time::Duration::from_millis(100))) {
1148 Ok(()) => continue,
1149 Err(error)
1150 if error.errno() == linux_raw::EAGAIN
1151 || error.errno() == linux_raw::EINTR
1152 || error.errno() == linux_raw::ETIMEDOUT =>
1153 {
1154 continue
1155 }
1156 Err(error) => return Err(error),
1157 }
1158 }
1159 }
1160
1161 #[cfg(debug_assertions)]
1162 if cfg!(polkavm_dev_debug_zygote) {
1163 use core::fmt::Write;
1164 std::thread::sleep(core::time::Duration::from_millis(200));
1165
1166 let mut command = String::new();
1167 if std::fs::read_to_string("/proc/sys/kernel/yama/ptrace_scope").map(|value| value.trim() == "1").unwrap_or(false) {
1169 command.push_str("echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope ;");
1170 }
1171
1172 command.push_str(concat!(
1173 "gdb",
1174 " -ex 'set pagination off'",
1175 " -ex 'layout split'",
1176 " -ex 'set print asm-demangle on'",
1177 " -ex 'set debuginfod enabled off'",
1178 " -ex 'tcatch exec'",
1179 " -ex 'handle SIGSTOP nostop'",
1180 ));
1181
1182 let _ = write!(&mut command, " -ex 'attach {}' -ex 'continue'", child.pid);
1183
1184 let mut cmd =
1185 if std::env::var_os("DISPLAY").is_some() {
1186 let mut cmd = std::process::Command::new("urxvt");
1188 cmd
1189 .args(["-fg", "rgb:ffff/ffff/ffff"])
1190 .args(["-bg", "rgba:0000/0000/0000/7777"])
1191 .arg("-e")
1192 .arg("sh")
1193 .arg("-c")
1194 .arg(&command);
1195 cmd
1196 } else {
1197 let mut cmd = std::process::Command::new("sh");
1199 cmd
1200 .arg("-c")
1201 .arg(&command);
1202 cmd
1203 };
1204
1205 let mut gdb = match cmd.spawn() {
1206 Ok(child) => child,
1207 Err(error) => {
1208 panic!("failed to launch: '{cmd:?}': {error}");
1209 }
1210 };
1211
1212 let pid = child.pid;
1213 std::thread::spawn(move || {
1214 let _ = gdb.wait();
1215 let _ = linux_raw::sys_kill(pid, linux_raw::SIGKILL);
1216 });
1217 }
1218
1219 let vmctx = unsafe { &*vmctx_mmap.as_ptr().cast::<VmCtx>() };
1220
1221 if let Err(error) = linux_raw::sendfd(socket.borrow(), vmctx_memfd.borrow()) {
1223 let message = get_message(vmctx);
1224 if let Some(message) = message {
1225 let error = Error::from(format!("failed to initialize sandbox process: {error} (root cause: {message})"));
1226 return Err(error);
1227 }
1228
1229 return Err(error);
1230 }
1231
1232 linux_raw::sendfd(socket.borrow(), lifetime_pipe_child.borrow())?;
1233 lifetime_pipe_child.close()?;
1234
1235 wait_for_futex(vmctx, &mut child, VMCTX_FUTEX_BUSY, VMCTX_FUTEX_INIT)?;
1237
1238 let maps = std::fs::read(format!("/proc/{}/maps", child_pid))
1242 .map_err(|error| Error::from_errno("failed to read child's maps", error.raw_os_error().unwrap_or(0)))?;
1243
1244 for line in maps.split(|&byte| byte == b'\n') {
1245 if line.is_empty() {
1246 continue;
1247 }
1248
1249 let map = Map::parse(line).ok_or_else(|| Error::from_str("failed to parse the maps of the child process"))?;
1250 match map.name {
1251 b"[stack]" => {
1252 vmctx.init.stack_address.store(map.start, Ordering::Relaxed);
1253 vmctx.init.stack_length.store(map.end - map.start, Ordering::Relaxed);
1254 }
1255 b"[vdso]" => {
1256 vmctx.init.vdso_address.store(map.start, Ordering::Relaxed);
1257 vmctx.init.vdso_length.store(map.end - map.start, Ordering::Relaxed);
1258 }
1259 b"[vvar]" => {
1260 vmctx.init.vvar_address.store(map.start, Ordering::Relaxed);
1261 vmctx.init.vvar_length.store(map.end - map.start, Ordering::Relaxed);
1262 }
1263 b"[vsyscall]" => {
1264 if map.is_readable {
1265 return Err(Error::from_str("failed to initialize sandbox process: vsyscall region is readable"));
1266 }
1267 }
1268 _ => {}
1269 }
1270 }
1271
1272 vmctx.futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1274 linux_raw::sys_futex_wake_one(&vmctx.futex)?;
1275
1276 wait_for_futex(vmctx, &mut child, VMCTX_FUTEX_BUSY, VMCTX_FUTEX_IDLE)?;
1278
1279 Ok(Sandbox {
1280 _lifetime_pipe: lifetime_pipe_host,
1281 vmctx_mmap,
1282 child,
1283 socket,
1284
1285 count_wait_loop_start: 0,
1286 count_futex_wait: 0,
1287
1288 module: None,
1289 gas_metering: None,
1290 })
1291 }
1292
1293 fn execute(&mut self, mut args: ExecuteArgs) -> Result<(), ExecutionError<Self::Error>> {
1294 self.wait_if_necessary(match args.hostcall_handler {
1295 Some(ref mut hostcall_handler) => Some(&mut *hostcall_handler),
1296 None => None,
1297 }, true)?;
1298
1299 if args.is_async && args.hostcall_handler.is_some() {
1300 return Err(Error::from_str("requested asynchronous execution with a borrowed hostcall handler").into());
1301 }
1302
1303 unsafe {
1304 if let Some(module) = args.module {
1305 args.flags |= polkavm_common::zygote::VM_RPC_FLAG_RECONFIGURE;
1306
1307 let compiled_module = Self::as_compiled_module(module);
1308 let program = &compiled_module.sandbox_program;
1309 *self.vmctx().memory_config.get() = program.0.memory_config.clone();
1310 *self.vmctx().heap_info.heap_top.get() = u64::from(module.memory_map().heap_base());
1311 *self.vmctx().heap_info.heap_threshold.get() = u64::from(module.memory_map().rw_data_range().end);
1312 self.gas_metering = module.gas_metering();
1313 self.module = Some(module.clone());
1314 }
1315
1316 if let Some(gas) = crate::sandbox::get_gas(&args, self.gas_metering) {
1317 *self.vmctx().gas().get() = gas;
1318 }
1319
1320 *self.vmctx().rpc_address.get() = args.entry_point.map_or(0, |entry_point|
1321 Self::as_compiled_module(self.module.as_ref().unwrap()).export_trampolines[entry_point] as usize
1322 ) as u64;
1323
1324 *self.vmctx().rpc_flags.get() = args.flags;
1325 *self.vmctx().rpc_sbrk.get() = args.sbrk;
1326
1327 if let Some(regs) = args.regs {
1328 (*self.vmctx().regs().get()).copy_from_slice(regs);
1329 }
1330
1331 self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1332 linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1333
1334 if let Some(module) = args.module {
1335 let compiled_module = Self::as_compiled_module(module);
1336 linux_raw::sendfd(self.socket.borrow(), compiled_module.sandbox_program.0.memfd.borrow())?;
1338 }
1339 }
1340
1341 if !args.is_async {
1342 self.wait_if_necessary(match args.hostcall_handler {
1343 Some(ref mut hostcall_handler) => Some(&mut *hostcall_handler),
1344 None => None,
1345 }, args.entry_point.is_none())?;
1346 }
1347
1348 Ok(())
1349 }
1350
1351 #[inline]
1352 fn access(&mut self) -> SandboxAccess {
1353 SandboxAccess { sandbox: self }
1354 }
1355
1356 fn pid(&self) -> Option<u32> {
1357 Some(self.child.pid as u32)
1358 }
1359
1360 fn address_table() -> AddressTable {
1361 ZYGOTE_ADDRESS_TABLE
1362 }
1363
1364 fn vmctx_regs_offset() -> usize {
1365 get_field_offset!(VmCtx::new(), |base| base.regs().get())
1366 }
1367
1368 fn vmctx_gas_offset() -> usize {
1369 get_field_offset!(VmCtx::new(), |base| base.gas().get())
1370 }
1371
1372 fn vmctx_heap_info_offset() -> usize {
1373 get_field_offset!(VmCtx::new(), |base| base.heap_info())
1374 }
1375
1376 fn gas_remaining_impl(&self) -> Result<Option<Gas>, super::OutOfGas> {
1377 if self.gas_metering.is_none() { return Ok(None) };
1378 let raw_gas = unsafe { *self.vmctx().gas().get() };
1379 Gas::from_i64(raw_gas).ok_or(super::OutOfGas).map(Some)
1380 }
1381
1382 fn sync(&mut self) -> Result<(), Self::Error> {
1383 self.wait_if_necessary(None, true).map_err(|error| {
1384 match error {
1385 ExecutionError::Trap(..) => Error::from_str("unexpected trap"),
1386 ExecutionError::OutOfGas => Error::from_str("unexpected out of gas"),
1387 ExecutionError::Error(error) => error,
1388 }
1389 })
1390 }
1391}
1392
1393impl Sandbox {
1394 #[inline]
1395 fn vmctx(&self) -> &VmCtx {
1396 unsafe { &*self.vmctx_mmap.as_ptr().cast::<VmCtx>() }
1397 }
1398
1399 #[inline(never)]
1400 #[cold]
1401 fn wait(&mut self, mut hostcall_handler: Option<HostcallHandler>, low_latency: bool) -> Result<(), ExecutionError<Error>> {
1402 let mut spin_target = 0;
1403 let mut yield_target = 0;
1404 if low_latency {
1405 yield_target = 20;
1406 }
1407
1408 'outer: loop {
1409 self.count_wait_loop_start += 1;
1410
1411 let state = self.vmctx().futex.load(Ordering::Relaxed);
1412 if state == VMCTX_FUTEX_IDLE {
1413 core::sync::atomic::fence(Ordering::Acquire);
1414 return Ok(());
1415 }
1416
1417 if state == VMCTX_FUTEX_TRAP {
1418 core::sync::atomic::fence(Ordering::Acquire);
1419
1420 self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1421 linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1422
1423 return Err(ExecutionError::Trap(Trap::default()));
1424 }
1425
1426 if state == VMCTX_FUTEX_HOSTCALL {
1427 core::sync::atomic::fence(Ordering::Acquire);
1428
1429 let hostcall_handler = match hostcall_handler {
1430 Some(ref mut hostcall_handler) => &mut *hostcall_handler,
1431 None => {
1432 unsafe {
1433 *self.vmctx().hostcall().get() = polkavm_common::zygote::HOSTCALL_ABORT_EXECUTION;
1434 }
1435 self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1436 linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1437
1438 return Err(Error::from_str("hostcall called without any hostcall handler set").into());
1439 }
1440 };
1441
1442 let hostcall = unsafe { *self.vmctx().hostcall().get() };
1443 if hostcall == polkavm_common::HOSTCALL_TRACE {
1444 spin_target = 512;
1446 }
1447
1448 match hostcall_handler(hostcall, super::Sandbox::access(self).into()) {
1449 Ok(()) => {
1450 self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1451 linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1452 continue;
1453 }
1454 Err(trap) => {
1455 unsafe {
1456 *self.vmctx().hostcall().get() = polkavm_common::zygote::HOSTCALL_ABORT_EXECUTION;
1457 }
1458 self.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1459 linux_raw::sys_futex_wake_one(&self.vmctx().futex)?;
1460
1461 return Err(ExecutionError::Trap(trap));
1462 }
1463 }
1464 }
1465
1466 if state != VMCTX_FUTEX_BUSY {
1467 return Err(Error::from_str("internal error: unexpected worker process state").into());
1468 }
1469
1470 cache_program_memfd_if_necessary();
1472
1473 for _ in 0..yield_target {
1474 let _ = linux_raw::sys_sched_yield();
1475 if self.vmctx().futex.load(Ordering::Relaxed) != VMCTX_FUTEX_BUSY {
1476 continue 'outer;
1477 }
1478 }
1479
1480 for _ in 0..spin_target {
1481 core::hint::spin_loop();
1482 if self.vmctx().futex.load(Ordering::Relaxed) != VMCTX_FUTEX_BUSY {
1483 continue 'outer;
1484 }
1485 }
1486
1487 self.count_futex_wait += 1;
1488 match linux_raw::sys_futex_wait(&self.vmctx().futex, VMCTX_FUTEX_BUSY, Some(core::time::Duration::from_millis(100))) {
1489 Ok(()) => continue,
1490 Err(error) if error.errno() == linux_raw::EAGAIN || error.errno() == linux_raw::EINTR => continue,
1491 Err(error) if error.errno() == linux_raw::ETIMEDOUT => {
1492 log::trace!("Timeout expired while waiting for child #{}...", self.child.pid);
1493 self.check_child_status()?;
1494 }
1495 Err(error) => return Err(error.into()),
1496 }
1497 }
1498 }
1499
1500 fn check_child_status(&mut self) -> Result<(), Error> {
1501 let status = self.child.check_status(true)?;
1502 if status.is_running() {
1503 return Ok(());
1504 }
1505
1506 log::trace!("Child #{} is not running anymore: {status}", self.child.pid);
1507 let message = get_message(self.vmctx());
1508 if let Some(message) = message {
1509 Err(Error::from(format!("{status}: {message}")))
1510 } else {
1511 Err(Error::from(format!("worker process unexpectedly quit: {status}")))
1512 }
1513 }
1514
1515 #[inline]
1516 fn wait_if_necessary(&mut self, hostcall_handler: Option<HostcallHandler>, low_latency: bool) -> Result<(), ExecutionError<Error>> {
1517 if self.vmctx().futex.load(Ordering::Relaxed) != VMCTX_FUTEX_IDLE {
1518 self.wait(hostcall_handler, low_latency)?;
1519 }
1520
1521 Ok(())
1522 }
1523}
1524
1525pub struct SandboxAccess<'a> {
1526 sandbox: &'a mut Sandbox,
1527}
1528
1529impl<'a> From<SandboxAccess<'a>> for BackendAccess<'a> {
1530 fn from(access: SandboxAccess<'a>) -> Self {
1531 BackendAccess::CompiledLinux(access)
1532 }
1533}
1534
1535impl<'a> Access<'a> for SandboxAccess<'a> {
1536 type Error = MemoryAccessError<linux_raw::Error>;
1537
1538 fn get_reg(&self, reg: Reg) -> u32 {
1539 let regs = unsafe { &*self.sandbox.vmctx().regs().get() };
1540 regs[reg as usize]
1541 }
1542
1543 fn set_reg(&mut self, reg: Reg, value: u32) {
1544 unsafe {
1545 (*self.sandbox.vmctx().regs().get())[reg as usize] = value;
1546 }
1547 }
1548
1549 fn read_memory_into_slice<'slice, T>(&self, address: u32, buffer: &'slice mut T) -> Result<&'slice mut [u8], Self::Error>
1550 where
1551 T: ?Sized + AsUninitSliceMut,
1552 {
1553 let slice = buffer.as_uninit_slice_mut();
1554 log::trace!(
1555 "Reading memory: 0x{:x}-0x{:x} ({} bytes)",
1556 address,
1557 address as usize + slice.len(),
1558 slice.len()
1559 );
1560
1561 if address as usize + slice.len() > 0xffffffff {
1562 return Err(MemoryAccessError {
1563 address,
1564 length: slice.len() as u64,
1565 error: Error::from_str("out of range read"),
1566 });
1567 }
1568
1569 let length = slice.len();
1570 match linux_raw::vm_read_memory(self.sandbox.child.pid, [slice], [(address as usize, length)]) {
1571 Ok(actual_length) if actual_length == length => {
1572 unsafe { Ok(slice_assume_init_mut(slice)) }
1573 },
1574 Ok(_) => {
1575 Err(MemoryAccessError {
1576 address,
1577 length: slice.len() as u64,
1578 error: Error::from_str("incomplete read"),
1579 })
1580 },
1581 Err(error) => {
1582 Err(MemoryAccessError {
1583 address,
1584 length: slice.len() as u64,
1585 error,
1586 })
1587 }
1588 }
1589 }
1590
1591 fn write_memory(&mut self, address: u32, data: &[u8]) -> Result<(), Self::Error> {
1592 log::trace!(
1593 "Writing memory: 0x{:x}-0x{:x} ({} bytes)",
1594 address,
1595 address as usize + data.len(),
1596 data.len()
1597 );
1598
1599 if address as usize + data.len() > 0xffffffff {
1600 return Err(MemoryAccessError {
1601 address,
1602 length: data.len() as u64,
1603 error: Error::from_str("out of range write"),
1604 });
1605 }
1606
1607 self.sandbox.vmctx().is_memory_dirty.store(true, Ordering::Relaxed);
1608
1609 let length = data.len();
1610 match linux_raw::vm_write_memory(self.sandbox.child.pid, [data], [(address as usize, length)]) {
1611 Ok(actual_length) if actual_length == length => {
1612 Ok(())
1613 },
1614 Ok(_) => {
1615 Err(MemoryAccessError {
1616 address,
1617 length: data.len() as u64,
1618 error: Error::from_str("incomplete write"),
1619 })
1620 },
1621 Err(error) => {
1622 Err(MemoryAccessError {
1623 address,
1624 length: data.len() as u64,
1625 error,
1626 })
1627 }
1628 }
1629 }
1630
1631 fn sbrk(&mut self, size: u32) -> Option<u32> {
1632 if size == 0 {
1633 return Some(unsafe { *self.sandbox.vmctx().heap_info().heap_top.get() as u32 });
1634 }
1635
1636 debug_assert_eq!(self.sandbox.vmctx().futex.load(Ordering::Relaxed), VMCTX_FUTEX_HOSTCALL);
1637
1638 unsafe {
1639 *self.sandbox.vmctx().rpc_sbrk.get() = size;
1640 *self.sandbox.vmctx().hostcall().get() = polkavm_common::zygote::HOSTCALL_SBRK;
1641 }
1642
1643 self.sandbox.vmctx().futex.store(VMCTX_FUTEX_BUSY, Ordering::Release);
1644 if let Err(error) = linux_raw::sys_futex_wake_one(&self.sandbox.vmctx().futex) {
1645 panic!("sbrk failed: {error}");
1646 }
1647
1648 let mut timestamp = Instant::now();
1649 loop {
1650 let _ = linux_raw::sys_sched_yield();
1651 if self.sandbox.vmctx().futex.load(Ordering::Relaxed) == VMCTX_FUTEX_BUSY {
1652 let new_timestamp = Instant::now();
1653 let elapsed = new_timestamp - timestamp;
1654 if elapsed >= Duration::from_millis(100) {
1655 timestamp = new_timestamp;
1656 if let Err(error) = self.sandbox.check_child_status() {
1657 panic!("sbrk failed: {error}");
1658 }
1659 }
1660 continue;
1661 }
1662
1663 core::sync::atomic::fence(Ordering::Acquire);
1664 break;
1665 }
1666
1667 debug_assert_eq!(self.sandbox.vmctx().futex.load(Ordering::Relaxed), VMCTX_FUTEX_HOSTCALL);
1668
1669 let result = unsafe { *self.sandbox.vmctx().rpc_sbrk.get() };
1670 if result == 0 {
1671 None
1672 } else {
1673 Some(result)
1674 }
1675 }
1676
1677 fn heap_size(&self) -> u32 {
1678 let heap_base = unsafe { (*self.sandbox.vmctx().memory_config.get()).memory_map.heap_base() };
1679 let heap_top = unsafe { *self.sandbox.vmctx().heap_info().heap_top.get() };
1680 (heap_top - u64::from(heap_base)) as u32
1681 }
1682
1683 fn program_counter(&self) -> Option<u32> {
1684 let value = unsafe { *self.sandbox.vmctx().nth_instruction().get() };
1685
1686 if value == SANDBOX_EMPTY_NTH_INSTRUCTION {
1687 None
1688 } else {
1689 Some(value)
1690 }
1691 }
1692
1693 fn native_program_counter(&self) -> Option<u64> {
1694 let value = unsafe { *self.sandbox.vmctx().rip().get() };
1695
1696 if value == SANDBOX_EMPTY_NATIVE_PROGRAM_COUNTER {
1697 None
1698 } else {
1699 Some(value)
1700 }
1701 }
1702
1703 fn gas_remaining(&self) -> Option<Gas> {
1704 use super::Sandbox;
1705 self.sandbox.gas_remaining_impl().ok().unwrap_or(Some(Gas::MIN))
1706 }
1707
1708 fn consume_gas(&mut self, gas: u64) {
1709 if self.sandbox.gas_metering.is_none() { return }
1710 let gas_remaining = unsafe { &mut *self.sandbox.vmctx().gas().get() };
1711 *gas_remaining = gas_remaining.checked_sub_unsigned(gas).unwrap_or(-1);
1712 }
1713}