1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
//! This module defines the ABI boundary between the host and the zygote.
//!
//! In general everything here can be modified at will, provided the zygote
//! is recompiled.

use crate::abi::MemoryMap;
use core::cell::UnsafeCell;
use core::sync::atomic::{AtomicBool, AtomicU32, AtomicU64};

// Due to the limitations of Rust's compile time constant evaluation machinery
// we need to define this struct multiple times.
macro_rules! define_address_table {
    ($($name:ident: $type:ty,)+) => {
        /// The address table with each entry kept at its original type
        /// (in practice function pointers; see the invocation below).
        #[repr(C)]
        pub struct AddressTableRaw {
            $(pub $name: $type),+
        }

        /// The same table with every entry widened to a `u64` and no padding
        /// between fields.
        ///
        /// NOTE: being `repr(packed)`, fields must not be borrowed directly.
        #[derive(Copy, Clone)]
        #[repr(packed)]
        pub struct AddressTablePacked {
            $(pub $name: u64),+
        }

        /// The same entries as [`AddressTablePacked`], but with natural
        /// alignment so the fields can be used freely.
        #[derive(Copy, Clone)]
        pub struct AddressTable {
            $(pub $name: u64),+
        }

        impl AddressTable {
            /// Builds the table from the raw variant, casting each entry to `u64`.
            #[inline]
            pub fn from_raw(table: AddressTableRaw) -> Self {
                Self {
                    $(
                        $name: table.$name as u64
                    ),+
                }
            }

            /// Builds the table from the packed variant, copying field-by-field.
            ///
            /// `const` so it can be evaluated at compile time.
            pub const fn from_packed(table: &AddressTablePacked) -> Self {
                Self {
                    $(
                        $name: table.$name
                    ),+
                }
            }
        }

        // All three variants must have identical sizes, otherwise converting
        // between them would be unsound.
        static_assert!(core::mem::size_of::<AddressTableRaw>() == core::mem::size_of::<AddressTablePacked>());
        static_assert!(core::mem::size_of::<AddressTableRaw>() == core::mem::size_of::<AddressTable>());
    }
}

// These are the addresses exported from the zygote.
// (Comments inside the braces are stripped by the lexer before macro matching.)
define_address_table! {
    // Entered when the guest triggers a hostcall; takes the hostcall number.
    syscall_hostcall: unsafe extern "C" fn(u32),
    // Entered when the guest traps; does not return.
    syscall_trap: unsafe extern "C" fn() -> !,
    // Returns control back to the host; does not return to the guest.
    syscall_return: unsafe extern "C" fn() -> !,
    // Tracing hook. NOTE(review): arguments are presumably the instruction
    // number and native program counter (cf. `VmCtxSyscall`) — confirm.
    syscall_trace: unsafe extern "C" fn(u32, u64),
    // Grows the guest heap; takes the amount to allocate, returns a u32 result.
    syscall_sbrk: unsafe extern "C" fn(u64) -> u32,
}

/// The address where the native code starts inside of the VM.
///
/// This is not directly accessible by the program running inside of the VM.
pub const VM_ADDR_NATIVE_CODE: u64 = 0x100000000;

/// The address where the indirect jump table starts inside of the VM.
///
/// This is not directly accessible by the program running inside of the VM.
pub const VM_ADDR_JUMP_TABLE: u64 = 0x800000000;

/// The address where the return-to-host jump table vector physically resides.
///
/// Each jump table slot is 8 bytes wide, hence the `<< 3`.
pub const VM_ADDR_JUMP_TABLE_RETURN_TO_HOST: u64 = VM_ADDR_JUMP_TABLE + ((crate::abi::VM_ADDR_RETURN_TO_HOST as u64) << 3);

/// A special hostcall number set by the *host* to signal that the guest should stop executing the program.
///
/// Equal to `u32::MAX` (`0xffffffff`).
pub const HOSTCALL_ABORT_EXECUTION: u32 = !0;

/// A special hostcall number set by the *host* to signal that the guest should execute `sbrk`.
///
/// Equal to `u32::MAX - 1` (`0xfffffffe`).
pub const HOSTCALL_SBRK: u32 = !0 - 1;

/// A sentinel value to indicate that the instruction counter is not available.
pub const SANDBOX_EMPTY_NTH_INSTRUCTION: u32 = !0;

/// A sentinel value to indicate that the native program counter is not available.
pub const SANDBOX_EMPTY_NATIVE_PROGRAM_COUNTER: u64 = 0;

/// The address of the global per-VM context struct (see [`VmCtx`]).
pub const VM_ADDR_VMCTX: u64 = 0x400000000;

/// The address of the signal stack.
pub const VM_ADDR_SIGSTACK: u64 = 0x500000000;

/// The address of the native stack.
pub const VM_ADDR_NATIVE_STACK_LOW: u64 = 0x600000000;

/// The size of the native stack (16 KiB).
pub const VM_ADDR_NATIVE_STACK_SIZE: u64 = 0x4000;

/// The address of the top of the native stack.
pub const VM_ADDR_NATIVE_STACK_HIGH: u64 = VM_ADDR_NATIVE_STACK_LOW + VM_ADDR_NATIVE_STACK_SIZE;

/// The maximum number of native code bytes that can be emitted by a single VM instruction.
///
/// This does *not* affect the VM ABI and can be changed at will,
/// but should be high enough that it's never hit.
pub const VM_COMPILER_MAXIMUM_INSTRUCTION_LENGTH: u32 = 53;

/// The maximum number of native code bytes that can be emitted as an epilogue.
///
/// This does *not* affect the VM ABI and can be changed at will,
/// but should be high enough that it's never hit.
pub const VM_COMPILER_MAXIMUM_EPILOGUE_LENGTH: u32 = 1024 * 1024;

/// The maximum number of bytes the jump table can be.
///
/// One 8-byte slot per potential jump target: `VM_MAXIMUM_INSTRUCTION_COUNT + 1`
/// targets, scaled by the code address alignment.
pub const VM_SANDBOX_MAXIMUM_JUMP_TABLE_SIZE: u64 = (crate::abi::VM_MAXIMUM_INSTRUCTION_COUNT as u64 + 1)
    * core::mem::size_of::<u64>() as u64
    * crate::abi::VM_CODE_ADDRESS_ALIGNMENT as u64;

/// The maximum number of bytes the jump table can span in virtual memory.
///
/// Covers the full 32-bit guest address range at 8 bytes per slot.
pub const VM_SANDBOX_MAXIMUM_JUMP_TABLE_VIRTUAL_SIZE: u64 = 0x100000000 * core::mem::size_of::<u64>() as u64;

/// The maximum number of bytes the native code can be (just under 512 MiB).
pub const VM_SANDBOX_MAXIMUM_NATIVE_CODE_SIZE: u32 = 512 * 1024 * 1024 - 1;

/// The memory configuration used by a given program and/or sandbox instance.
///
/// `#[repr(C)]`: this struct crosses the host/zygote boundary (it is stored in
/// shared memory inside [`VmCtx`]), so its layout must be stable.
#[derive(Clone)]
#[repr(C)]
pub struct SandboxMemoryConfig {
    /// The guest's memory map.
    pub memory_map: MemoryMap,
    /// Size of the read-only data section.
    /// NOTE(review): presumably the size of a backing file descriptor's
    /// contents, given the `_fd_` naming — confirm against the host side.
    pub ro_data_fd_size: u32,
    /// Size of the read-write data section (same `_fd_` caveat as above).
    pub rw_data_fd_size: u32,
    /// Size of the compiled native code, in bytes.
    pub code_size: u32,
    /// Size of the jump table, in bytes.
    pub jump_table_size: u32,
    /// Address used to return control to the host.
    pub sysreturn_address: u64,
}

/// A flag which will trigger the sandbox to reload its program before execution.
pub const VM_RPC_FLAG_RECONFIGURE: u32 = 1 << 0;

/// One-time arguments used during sandbox initialization (see `VmCtx::init`).
///
/// NOTE(review): the field names suggest these describe the zygote process's
/// initial stack, vDSO and vvar mappings — confirm against the zygote's
/// startup code.
#[repr(C)]
pub struct VmInit {
    pub stack_address: AtomicU64,
    pub stack_length: AtomicU64,
    pub vdso_address: AtomicU64,
    pub vdso_length: AtomicU64,
    pub vvar_address: AtomicU64,
    pub vvar_length: AtomicU64,
}

/// Size in bytes of `VmCtx::message_buffer`.
const MESSAGE_BUFFER_SIZE: usize = 512;

/// Forces the wrapped value onto its own 64-byte boundary (one x86-64 cache
/// line), so that neighbouring fields don't share a cache line with it.
#[repr(align(64))]
pub struct CacheAligned<T>(pub T);

impl<T> core::ops::Deref for CacheAligned<T> {
    type Target = T;

    /// Transparently borrows the wrapped value.
    #[inline(always)]
    fn deref(&self) -> &T {
        let CacheAligned(inner) = self;
        inner
    }
}

impl<T> core::ops::DerefMut for CacheAligned<T> {
    /// Transparently borrows the wrapped value mutably.
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut T {
        let CacheAligned(inner) = self;
        inner
    }
}

/// The state of the guest program's heap, shared between host and zygote.
#[repr(C)]
pub struct VmCtxHeapInfo {
    /// The current top of the heap.
    /// NOTE(review): presumably advanced by `sbrk` — confirm against the
    /// `syscall_sbrk` implementation.
    pub heap_top: UnsafeCell<u64>,
    /// The current heap threshold.
    /// NOTE(review): presumably the limit past which the heap must be grown
    /// or remapped — confirm.
    pub heap_threshold: UnsafeCell<u64>,
}

// The number of guest registers; mirrors `Reg::ALL`.
const REG_COUNT: usize = crate::program::Reg::ALL.len();

/// Fields used when making syscalls from the VM into the host.
///
/// Stored cache-aligned inside [`VmCtx`]; accessed through `VmCtx`'s
/// accessor methods.
#[repr(C)]
pub struct VmCtxSyscall {
    // NOTE: The order of fields here can matter for performance!
    /// The current gas counter.
    pub gas: UnsafeCell<i64>,
    /// The hostcall number that was triggered.
    pub hostcall: UnsafeCell<u32>,
    /// A dump of all of the registers of the VM.
    pub regs: UnsafeCell<[u32; REG_COUNT]>,
    /// The number of the instruction just about to be executed.
    ///
    /// Should be treated as empty if equal to `SANDBOX_EMPTY_NTH_INSTRUCTION`.
    pub nth_instruction: UnsafeCell<u32>,

    /// The current RIP. Filled out in case of a trap or during tracing.
    ///
    /// Should be treated as empty if equal to `SANDBOX_EMPTY_NATIVE_PROGRAM_COUNTER`.
    pub rip: UnsafeCell<u64>,
}

/// Performance counters. Only for debugging (see `VmCtx::counters`).
#[repr(C)]
pub struct VmCtxCounters {
    /// NOTE(review): presumably counts entries into the syscall wait loop —
    /// confirm against the zygote's wait loop.
    pub syscall_wait_loop_start: UnsafeCell<u64>,
    /// NOTE(review): presumably counts actual futex-wait syscalls (as opposed
    /// to spins; cf. `syscall_wait_loop_start`) — confirm.
    pub syscall_futex_wait: UnsafeCell<u64>,
}

/// The virtual machine context.
///
/// This is mapped in shared memory and used by the sandbox to keep its state in,
/// as well as by the host to communicate with the sandbox.
///
/// `#[repr(C)]` because both sides must agree on the exact layout.
#[allow(clippy::partial_pub_fields)]
#[repr(C)]
pub struct VmCtx {
    /// Fields used when making syscalls from the VM into the host.
    ///
    /// Deliberately private: accessed only through the accessor methods so the
    /// layout can be shuffled without touching the rest of the codebase.
    syscall_ffi: CacheAligned<VmCtxSyscall>,

    /// The state of the program's heap.
    pub heap_info: VmCtxHeapInfo,

    /// The futex used to synchronize the sandbox with the host process.
    /// Holds one of the `VMCTX_FUTEX_*` values.
    pub futex: CacheAligned<AtomicU32>,

    /// The address of the native code to call inside of the VM, if non-zero.
    pub rpc_address: UnsafeCell<u64>,
    /// Flags specifying what exactly the sandbox should do (`VM_RPC_FLAG_*`).
    pub rpc_flags: UnsafeCell<u32>,
    /// The amount of memory to allocate.
    pub rpc_sbrk: UnsafeCell<u32>,
    /// The memory configuration of the sandbox.
    pub memory_config: UnsafeCell<SandboxMemoryConfig>,
    /// Whether the memory of the sandbox is dirty.
    pub is_memory_dirty: AtomicBool,

    /// Performance counters. Only for debugging.
    pub counters: CacheAligned<VmCtxCounters>,

    /// One-time args used during initialization.
    pub init: VmInit,

    /// Length of the message in the message buffer.
    pub message_length: UnsafeCell<u32>,
    /// A buffer used to marshal error messages.
    pub message_buffer: UnsafeCell<[u8; MESSAGE_BUFFER_SIZE]>,
}

// Make sure it fits within a single page on amd64.
static_assert!(core::mem::size_of::<VmCtx>() <= 4096);

// States stored in `VmCtx::futex`, used to synchronize the sandbox with the
// host process.

/// The VM is busy.
pub const VMCTX_FUTEX_BUSY: u32 = 0;

/// The VM is ready to be initialized.
pub const VMCTX_FUTEX_INIT: u32 = 1;

/// The VM is idle and is waiting for work.
pub const VMCTX_FUTEX_IDLE: u32 = 2;

/// The VM has triggered a host call.
pub const VMCTX_FUTEX_HOSTCALL: u32 = 3;

/// The VM has triggered a trap.
pub const VMCTX_FUTEX_TRAP: u32 = 4;

impl VmCtx {
    /// Creates a VM context with every field set to zero
    /// (and the futex in the `VMCTX_FUTEX_BUSY` state, which is also zero).
    ///
    /// Fields are initialized in declaration order of the struct.
    pub const fn zeroed() -> Self {
        VmCtx {
            syscall_ffi: CacheAligned(VmCtxSyscall {
                gas: UnsafeCell::new(0),
                hostcall: UnsafeCell::new(0),
                regs: UnsafeCell::new([0; REG_COUNT]),
                nth_instruction: UnsafeCell::new(0),
                rip: UnsafeCell::new(0),
            }),

            heap_info: VmCtxHeapInfo {
                heap_top: UnsafeCell::new(0),
                heap_threshold: UnsafeCell::new(0),
            },

            futex: CacheAligned(AtomicU32::new(VMCTX_FUTEX_BUSY)),

            rpc_address: UnsafeCell::new(0),
            rpc_flags: UnsafeCell::new(0),
            rpc_sbrk: UnsafeCell::new(0),
            memory_config: UnsafeCell::new(SandboxMemoryConfig {
                memory_map: MemoryMap::empty(),
                ro_data_fd_size: 0,
                rw_data_fd_size: 0,
                code_size: 0,
                jump_table_size: 0,
                sysreturn_address: 0,
            }),
            is_memory_dirty: AtomicBool::new(false),

            counters: CacheAligned(VmCtxCounters {
                syscall_wait_loop_start: UnsafeCell::new(0),
                syscall_futex_wait: UnsafeCell::new(0),
            }),

            init: VmInit {
                stack_address: AtomicU64::new(0),
                stack_length: AtomicU64::new(0),
                vdso_address: AtomicU64::new(0),
                vdso_length: AtomicU64::new(0),
                vvar_address: AtomicU64::new(0),
                vvar_length: AtomicU64::new(0),
            },

            message_length: UnsafeCell::new(0),
            message_buffer: UnsafeCell::new([0; MESSAGE_BUFFER_SIZE]),
        }
    }

    /// Creates a fresh VM context.
    ///
    /// Identical to [`Self::zeroed`], except that `nth_instruction` starts out
    /// as `SANDBOX_EMPTY_NTH_INSTRUCTION` ("not available") instead of zero.
    pub const fn new() -> Self {
        let mut vmctx = Self::zeroed();
        vmctx.syscall_ffi.0.nth_instruction = UnsafeCell::new(SANDBOX_EMPTY_NTH_INSTRUCTION);
        vmctx
    }

    // Accessors for the private `syscall_ffi` fields, so the rest of the
    // codebase stays untouched when the layout is shuffled around.

    /// The current gas counter.
    #[inline(always)]
    pub const fn gas(&self) -> &UnsafeCell<i64> {
        &self.syscall_ffi.0.gas
    }

    /// The hostcall number that was triggered.
    #[inline(always)]
    pub const fn hostcall(&self) -> &UnsafeCell<u32> {
        &self.syscall_ffi.0.hostcall
    }

    /// A dump of all of the registers of the VM.
    #[inline(always)]
    pub const fn regs(&self) -> &UnsafeCell<[u32; REG_COUNT]> {
        &self.syscall_ffi.0.regs
    }

    /// The number of the instruction just about to be executed, if any.
    #[inline(always)]
    pub const fn nth_instruction(&self) -> &UnsafeCell<u32> {
        &self.syscall_ffi.0.nth_instruction
    }

    /// The current RIP, if available.
    #[inline(always)]
    pub const fn rip(&self) -> &UnsafeCell<u64> {
        &self.syscall_ffi.0.rip
    }

    /// The state of the program's heap.
    #[inline(always)]
    pub const fn heap_info(&self) -> &VmCtxHeapInfo {
        &self.heap_info
    }
}

// Jump table layout invariants: the return-to-host slot must lie inside the
// table's reserved virtual range, past any maximum-sized real table, and be
// 0x4000-aligned.
static_assert!(VM_ADDR_JUMP_TABLE_RETURN_TO_HOST > VM_ADDR_JUMP_TABLE);
static_assert!(VM_ADDR_JUMP_TABLE_RETURN_TO_HOST % 0x4000 == 0);
static_assert!(VM_SANDBOX_MAXIMUM_JUMP_TABLE_SIZE <= VM_SANDBOX_MAXIMUM_JUMP_TABLE_VIRTUAL_SIZE);
static_assert!(VM_ADDR_JUMP_TABLE + VM_SANDBOX_MAXIMUM_JUMP_TABLE_SIZE < VM_ADDR_JUMP_TABLE_RETURN_TO_HOST);
static_assert!(VM_ADDR_JUMP_TABLE_RETURN_TO_HOST < VM_ADDR_JUMP_TABLE + VM_SANDBOX_MAXIMUM_JUMP_TABLE_VIRTUAL_SIZE);
// The jump table base must be a power of two.
static_assert!(VM_ADDR_JUMP_TABLE.count_ones() == 1);
static_assert!((1 << VM_ADDR_JUMP_TABLE.trailing_zeros()) == VM_ADDR_JUMP_TABLE);

// The code region limit must fit the worst-case compiled program plus epilogue.
static_assert!(
    VM_SANDBOX_MAXIMUM_NATIVE_CODE_SIZE
        >= crate::abi::VM_MAXIMUM_INSTRUCTION_COUNT * VM_COMPILER_MAXIMUM_INSTRUCTION_LENGTH + VM_COMPILER_MAXIMUM_EPILOGUE_LENGTH
);
// Host-reserved addresses must lie above the 4 GiB boundary.
// NOTE(review): presumably so they can never alias 32-bit guest addresses
// (guest registers are `u32`) — confirm.
static_assert!(VM_ADDR_NATIVE_CODE > 0xffffffff);
static_assert!(VM_ADDR_VMCTX > 0xffffffff);
static_assert!(VM_ADDR_NATIVE_STACK_LOW > 0xffffffff);