// wasmtime_runtime/cow.rs

1//! Copy-on-write initialization support: creation of backing images for
2//! modules, and logic to support mapping these backing images into memory.
3
4#![cfg_attr(not(unix), allow(unused_imports, unused_variables))]
5
6use crate::MmapVec;
7use anyhow::Result;
8use libc::c_void;
9use std::fs::File;
10use std::sync::Arc;
11use std::{convert::TryFrom, ops::Range};
12use wasmtime_environ::{
13    DefinedMemoryIndex, MemoryInitialization, MemoryPlan, MemoryStyle, Module, PrimaryMap,
14};
15
/// Backing images for memories in a module.
///
/// This is meant to be built once, when a module is first loaded/constructed,
/// and then used many times for instantiation.
pub struct ModuleMemoryImages {
    // One optional image per defined memory, indexed by defined-memory index.
    // `None` means that memory has no CoW initialization image.
    memories: PrimaryMap<DefinedMemoryIndex, Option<Arc<MemoryImage>>>,
}
23
24impl ModuleMemoryImages {
25    /// Get the MemoryImage for a given memory.
26    pub fn get_memory_image(&self, defined_index: DefinedMemoryIndex) -> Option<&Arc<MemoryImage>> {
27        self.memories[defined_index].as_ref()
28    }
29}
30
/// One backing image for one memory.
///
/// Note that `PartialEq` (derived below plus the manual `FdSource` impl)
/// compares images by file-descriptor identity and offsets, not by content.
#[derive(Debug, PartialEq)]
pub struct MemoryImage {
    /// The file descriptor source of this image.
    ///
    /// This might be an mmaped `*.cwasm` file or on Linux it could also be a
    /// `Memfd` as an anonymous file in memory. In either case this is used as
    /// the backing-source for the CoW image.
    fd: FdSource,

    /// Length of image, in bytes.
    ///
    /// Note that initial memory size may be larger; leading and trailing zeroes
    /// are truncated (handled by backing fd).
    ///
    /// Must be a multiple of the system page size.
    len: usize,

    /// Image starts this many bytes into `fd` source.
    ///
    /// This is 0 for anonymous-backed memfd files and is the offset of the data
    /// section in a `*.cwasm` file for `*.cwasm`-backed images.
    ///
    /// Must be a multiple of the system page size.
    fd_offset: u64,

    /// Image starts this many bytes into heap space.
    ///
    /// Must be a multiple of the system page size.
    linear_memory_offset: usize,
}
62
/// Where the bytes backing a `MemoryImage` live.
///
/// On non-unix platforms this enum has no variants and is uninhabited, which
/// the platform-specific code below exploits with empty `match`es.
#[derive(Debug)]
enum FdSource {
    /// An mmap-able file on disk, e.g. an original `*.cwasm` file.
    #[cfg(unix)]
    Mmap(Arc<File>),
    /// An anonymous in-memory file created via `memfd_create` (Linux only).
    #[cfg(target_os = "linux")]
    Memfd(memfd::Memfd),
}
70
impl FdSource {
    /// Returns the underlying `File`, regardless of which variant backs this
    /// source.
    #[cfg(unix)]
    fn as_file(&self) -> &File {
        match self {
            FdSource::Mmap(ref file) => file,
            #[cfg(target_os = "linux")]
            FdSource::Memfd(ref memfd) => memfd.as_file(),
        }
    }
}
81
impl PartialEq for FdSource {
    fn eq(&self, other: &FdSource) -> bool {
        cfg_if::cfg_if! {
            if #[cfg(unix)] {
                use rustix::fd::AsRawFd;
                // Two sources are considered equal when they refer to the
                // same open file descriptor; file contents are not compared.
                self.as_file().as_raw_fd() == other.as_file().as_raw_fd()
            } else {
                // Off-unix `FdSource` has no variants, so this code is
                // unreachable. `drop(other)` silences the unused-variable
                // warning and the empty match type-checks as `!`.
                drop(other);
                match *self {}
            }
        }
    }
}
95
impl MemoryImage {
    /// Attempts to build a `MemoryImage` from `data`, the initialization
    /// contents destined for linear-memory offset `offset`.
    ///
    /// `mmap`, if provided, is the backing storage that `data` is a sub-slice
    /// of; when it originates from a file on disk that file is reused as the
    /// image source. Returns `Ok(None)` when no mmap-compatible image can be
    /// created on this platform/configuration.
    fn new(
        page_size: u32,
        offset: u64,
        data: &[u8],
        mmap: Option<&MmapVec>,
    ) -> Result<Option<MemoryImage>> {
        // Sanity-check that various parameters are page-aligned.
        let len = data.len();
        assert_eq!(offset % u64::from(page_size), 0);
        assert_eq!((len as u32) % page_size, 0);
        // An offset that doesn't fit in `usize` (e.g. on 32-bit hosts) can't
        // exist in this address space, so no image is possible.
        let linear_memory_offset = match usize::try_from(offset) {
            Ok(offset) => offset,
            Err(_) => return Ok(None),
        };

        // If a backing `mmap` is present then `data` should be a sub-slice of
        // the `mmap`. The sanity-checks here double-check that. Additionally
        // compilation should have ensured that the `data` section is
        // page-aligned within `mmap`, so that's also all double-checked here.
        //
        // Finally if the `mmap` itself comes from a backing file on disk, such
        // as a `*.cwasm` file, then that's a valid source of data for the
        // memory image so we simply return referencing that.
        //
        // Note that this path is platform-agnostic in the sense of all
        // platforms we support support memory mapping copy-on-write data from
        // files, but for now this is still a Linux-specific region of Wasmtime.
        // Some work will be needed to get this file compiling for macOS and
        // Windows.
        #[cfg(not(windows))]
        if let Some(mmap) = mmap {
            let start = mmap.as_ptr() as usize;
            let end = start + mmap.len();
            let data_start = data.as_ptr() as usize;
            let data_end = data_start + data.len();
            assert!(start <= data_start && data_end <= end);
            assert_eq!((start as u32) % page_size, 0);
            assert_eq!((data_start as u32) % page_size, 0);
            assert_eq!((data_end as u32) % page_size, 0);
            assert_eq!((mmap.original_offset() as u32) % page_size, 0);

            if let Some(file) = mmap.original_file() {
                return Ok(Some(MemoryImage {
                    fd: FdSource::Mmap(file.clone()),
                    fd_offset: u64::try_from(mmap.original_offset() + (data_start - start))
                        .unwrap(),
                    linear_memory_offset,
                    len,
                }));
            }
        }

        // If `mmap` doesn't come from a file then platform-specific mechanisms
        // may be used to place the data in a form that's amenable to an mmap.

        cfg_if::cfg_if! {
            if #[cfg(target_os = "linux")] {
                // On Linux `memfd_create` is used to create an anonymous
                // in-memory file to represent the heap image. This anonymous
                // file is then used as the basis for further mmaps.

                use std::io::Write;

                let memfd = create_memfd()?;
                memfd.as_file().write_all(data)?;

                // Seal the memfd's data and length.
                //
                // This is a defense-in-depth security mitigation. The
                // memfd will serve as the starting point for the heap of
                // every instance of this module. If anything were to
                // write to this, it could affect every execution. The
                // memfd object itself is owned by the machinery here and
                // not exposed elsewhere, but it is still an ambient open
                // file descriptor at the syscall level, so some other
                // vulnerability that allowed writes to arbitrary fds
                // could modify it. Or we could have some issue with the
                // way that we map it into each instance. To be
                // extra-super-sure that it never changes, and because
                // this costs very little, we use the kernel's "seal" API
                // to make the memfd image permanently read-only.
                memfd.add_seals(&[
                    memfd::FileSeal::SealGrow,
                    memfd::FileSeal::SealShrink,
                    memfd::FileSeal::SealWrite,
                    memfd::FileSeal::SealSeal,
                ])?;

                Ok(Some(MemoryImage {
                    fd: FdSource::Memfd(memfd),
                    fd_offset: 0,
                    linear_memory_offset,
                    len,
                }))
            } else {
                // Other platforms don't have an easily available way of
                // representing the heap image as an mmap-source right now. We
                // could theoretically create a file and immediately unlink it
                // but that means that data may likely be preserved to disk
                // which isn't what we want here.
                Ok(None)
            }
        }
    }

    /// Maps this image, read/write and copy-on-write (`MAP_PRIVATE`), at
    /// `base + self.linear_memory_offset` within a linear memory whose base
    /// address is `base`.
    ///
    /// # Safety
    ///
    /// The `self.len` bytes at `base + self.linear_memory_offset` must be a
    /// virtual-memory region owned by the caller: `MAP_FIXED` unconditionally
    /// replaces whatever mapping currently exists there.
    unsafe fn map_at(&self, base: usize) -> Result<()> {
        cfg_if::cfg_if! {
            if #[cfg(unix)] {
                let ptr = rustix::mm::mmap(
                    (base + self.linear_memory_offset) as *mut c_void,
                    self.len,
                    rustix::mm::ProtFlags::READ | rustix::mm::ProtFlags::WRITE,
                    rustix::mm::MapFlags::PRIVATE | rustix::mm::MapFlags::FIXED,
                    self.fd.as_file(),
                    self.fd_offset,
                )?;
                // With MAP_FIXED the kernel must honor the requested address.
                assert_eq!(ptr as usize, base + self.linear_memory_offset);
                Ok(())
            } else {
                // Off-unix `FdSource` is uninhabited, so this is unreachable;
                // the empty match type-checks as `!`.
                match self.fd {}
            }
        }
    }

    /// Replaces this image's range at `base + self.linear_memory_offset`
    /// with fresh anonymous zero memory (read/write).
    ///
    /// # Safety
    ///
    /// Same contract as [`MemoryImage::map_at`]: the target range must be an
    /// owned reservation, since `MAP_FIXED` replaces any existing mapping.
    unsafe fn remap_as_zeros_at(&self, base: usize) -> Result<()> {
        cfg_if::cfg_if! {
            if #[cfg(unix)] {
                let ptr = rustix::mm::mmap_anonymous(
                    (base + self.linear_memory_offset) as *mut c_void,
                    self.len,
                    rustix::mm::ProtFlags::READ | rustix::mm::ProtFlags::WRITE,
                    rustix::mm::MapFlags::PRIVATE | rustix::mm::MapFlags::FIXED,
                )?;
                // With MAP_FIXED the kernel must honor the requested address.
                assert_eq!(ptr as usize, base + self.linear_memory_offset);
                Ok(())
            } else {
                // Off-unix `FdSource` is uninhabited, so this is unreachable.
                match self.fd {}
            }
        }
    }
}
238
239#[cfg(target_os = "linux")]
240fn create_memfd() -> Result<memfd::Memfd> {
241    // Create the memfd. It needs a name, but the
242    // documentation for `memfd_create()` says that names can
243    // be duplicated with no issues.
244    memfd::MemfdOptions::new()
245        .allow_sealing(true)
246        .create("wasm-memory-image")
247        .map_err(|e| e.into())
248}
249
impl ModuleMemoryImages {
    /// Create a new `ModuleMemoryImages` for the given module. This can be
    /// passed in as part of a `InstanceAllocationRequest` to speed up
    /// instantiation and execution by using copy-on-write-backed memories.
    ///
    /// Returns `Ok(None)` when images cannot be built for this module:
    /// non-static memory initialization, an imported memory, or an image
    /// that cannot be represented on this platform.
    pub fn new(
        module: &Module,
        wasm_data: &[u8],
        mmap: Option<&MmapVec>,
    ) -> Result<Option<ModuleMemoryImages>> {
        // Images require the "static" initialization form computed at
        // compile time; anything else cannot be turned into images.
        let map = match &module.memory_initialization {
            MemoryInitialization::Static { map } => map,
            _ => return Ok(None),
        };
        let mut memories = PrimaryMap::with_capacity(map.len());
        let page_size = crate::page_size() as u32;
        for (memory_index, init) in map {
            // mmap-based-initialization only works for defined memories with a
            // known starting point of all zeros, so bail out if the memory is
            // imported.
            let defined_memory = match module.defined_memory_index(memory_index) {
                Some(idx) => idx,
                None => return Ok(None),
            };

            // If there's no initialization for this memory known then we don't
            // need an image for the memory so push `None` and move on.
            let init = match init {
                Some(init) => init,
                None => {
                    memories.push(None);
                    continue;
                }
            };

            // Get the image for this wasm module as a subslice of `wasm_data`,
            // and then use that to try to create the `MemoryImage`. If this
            // creation fails then we fail creating `ModuleMemoryImages` since
            // this memory couldn't be represented.
            let data = &wasm_data[init.data.start as usize..init.data.end as usize];
            let image = match MemoryImage::new(page_size, init.offset, data, mmap)? {
                Some(image) => image,
                None => return Ok(None),
            };

            let idx = memories.push(Some(Arc::new(image)));
            assert_eq!(idx, defined_memory);
        }

        Ok(Some(ModuleMemoryImages { memories }))
    }
}
301
302/// Slot management of a copy-on-write image which can be reused for the pooling
303/// allocator.
304///
305/// This data structure manages a slot of linear memory, primarily in the
306/// pooling allocator, which optionally has a contiguous memory image in the
307/// middle of it. Pictorially this data structure manages a virtual memory
308/// region that looks like:
309///
310/// ```text
311///   +--------------------+-------------------+--------------+--------------+
312///   |   anonymous        |      optional     |   anonymous  |    PROT_NONE |
313///   |     zero           |       memory      |     zero     |     memory   |
314///   |    memory          |       image       |    memory    |              |
315///   +--------------------+-------------------+--------------+--------------+
316///   |                     <------+---------->
317///   |<-----+------------>         \
318///   |      \                   image.len
319///   |       \
320///   |  image.linear_memory_offset
321///   |
322///   \
323///  self.base is this virtual address
324///
325///    <------------------+------------------------------------------------>
326///                        \
327///                      static_size
328///
329///    <------------------+---------------------------------->
330///                        \
331///                      accessible
332/// ```
333///
334/// When a `MemoryImageSlot` is created it's told what the `static_size` and
335/// `accessible` limits are. Initially there is assumed to be no image in linear
336/// memory.
337///
338/// When `MemoryImageSlot::instantiate` is called then the method will perform
339/// a "synchronization" to take the image from its prior state to the new state
340/// for the image specified. The first instantiation for example will mmap the
341/// heap image into place. Upon reuse of a slot nothing happens except possibly
342/// shrinking `self.accessible`. When a new image is used then the old image is
343/// mapped to anonymous zero memory and then the new image is mapped in place.
344///
345/// A `MemoryImageSlot` is either `dirty` or it isn't. When a `MemoryImageSlot`
346/// is dirty then it is assumed that any memory beneath `self.accessible` could
347/// have any value. Instantiation cannot happen into a `dirty` slot, however, so
348/// the `MemoryImageSlot::clear_and_remain_ready` returns this memory back to
349/// its original state to mark `dirty = false`. This is done by resetting all
350/// anonymous memory back to zero and the image itself back to its initial
351/// contents.
352///
353/// On Linux this is achieved with the `madvise(MADV_DONTNEED)` syscall. This
354/// syscall will release the physical pages back to the OS but retain the
355/// original mappings, effectively resetting everything back to its initial
356/// state. Non-linux platforms will replace all memory below `self.accessible`
357/// with a fresh zero'd mmap, meaning that reuse is effectively not supported.
#[derive(Debug)]
pub struct MemoryImageSlot {
    /// The base address in virtual memory of the actual heap memory.
    ///
    /// Bytes at this address are what is seen by the Wasm guest code.
    ///
    /// Note that this is stored as `usize` instead of `*mut u8` to not deal
    /// with `Send`/`Sync`.
    base: usize,

    /// The maximum static memory size which `self.accessible` can grow to.
    static_size: usize,

    /// An optional image that is currently being used in this linear memory.
    ///
    /// This can be `None` in which case memory is originally all zeros. When
    /// `Some` the image describes where it's located within the image.
    image: Option<Arc<MemoryImage>>,

    /// The size of the heap that is readable and writable.
    ///
    /// Note that this may extend beyond the actual linear memory heap size in
    /// the case of dynamic memories in use. Memory accesses to memory below
    /// `self.accessible` may still page fault as pages are lazily brought in
    /// but the faults will always be resolved by the kernel.
    accessible: usize,

    /// Whether this slot may have "dirty" pages (pages written by an
    /// instantiation). Set by `instantiate()` and cleared by
    /// `clear_and_remain_ready()`, and used in assertions to ensure
    /// those methods are called properly.
    ///
    /// Invariant: if !dirty, then this memory slot contains a clean
    /// CoW mapping of `image`, if `Some(..)`, and anonymous-zero
    /// memory beyond the image up to `static_size`. The addresses
    /// from offset 0 to `self.accessible` are R+W and set to zero or the
    /// initial image content, as appropriate. Everything between
    /// `self.accessible` and `self.static_size` is inaccessible.
    dirty: bool,

    /// Whether this MemoryImageSlot is responsible for mapping anonymous
    /// memory (to hold the reservation while overwriting mappings
    /// specific to this slot) in place when it is dropped. Default
    /// on, unless the caller knows what they are doing.
    clear_on_drop: bool,
}
404
405impl MemoryImageSlot {
406    /// Create a new MemoryImageSlot. Assumes that there is an anonymous
407    /// mmap backing in the given range to start.
408    ///
409    /// The `accessible` parameter descibes how much of linear memory is
410    /// already mapped as R/W with all zero-bytes. The `static_size` value is
411    /// the maximum size of this image which `accessible` cannot grow beyond,
412    /// and all memory from `accessible` from `static_size` should be mapped as
413    /// `PROT_NONE` backed by zero-bytes.
414    pub(crate) fn create(base_addr: *mut c_void, accessible: usize, static_size: usize) -> Self {
415        let base = base_addr as usize;
416        MemoryImageSlot {
417            base,
418            static_size,
419            accessible,
420            image: None,
421            dirty: false,
422            clear_on_drop: true,
423        }
424    }
425
426    #[cfg(feature = "pooling-allocator")]
427    pub(crate) fn dummy() -> MemoryImageSlot {
428        MemoryImageSlot {
429            base: 0,
430            static_size: 0,
431            image: None,
432            accessible: 0,
433            dirty: false,
434            clear_on_drop: false,
435        }
436    }
437
    /// Inform the MemoryImageSlot that it should *not* clear the underlying
    /// address space when dropped. This should be used only when the
    /// caller will clear or reuse the address space in some other
    /// way.
    pub(crate) fn no_clear_on_drop(&mut self) {
        self.clear_on_drop = false;
    }
445
446    pub(crate) fn set_heap_limit(&mut self, size_bytes: usize) -> Result<()> {
447        assert!(size_bytes <= self.static_size);
448
449        // If the heap limit already addresses accessible bytes then no syscalls
450        // are necessary since the data is already mapped into the process and
451        // waiting to go.
452        //
453        // This is used for "dynamic" memories where memory is not always
454        // decommitted during recycling (but it's still always reset).
455        if size_bytes <= self.accessible {
456            return Ok(());
457        }
458
459        // Otherwise use `mprotect` to make the new pages read/write.
460        self.set_protection(self.accessible..size_bytes, true)?;
461        self.accessible = size_bytes;
462
463        Ok(())
464    }
465
    /// Prepares this slot for the instantiation of a new instance with the
    /// provided linear memory image.
    ///
    /// The `initial_size_bytes` parameter indicates the required initial size
    /// of the heap for the instance. The `maybe_image` is an optional initial
    /// image for linear memory to contain. The `plan` describes the way
    /// compiled code will be accessing this memory.
    ///
    /// The purpose of this method is to take a previously pristine slot
    /// (`!self.dirty`) and transform its prior state into state necessary for
    /// the given parameters. This could include, for example:
    ///
    /// * More memory may be made read/write if `initial_size_bytes` is larger
    ///   than `self.accessible`.
    /// * For `MemoryStyle::Static` linear memory may be made `PROT_NONE` if
    ///   `self.accessible` is larger than `initial_size_bytes`.
    /// * If no image was previously in place or if the wrong image was
    ///   previously in place then `mmap` may be used to setup the initial
    ///   image.
    pub(crate) fn instantiate(
        &mut self,
        initial_size_bytes: usize,
        maybe_image: Option<&Arc<MemoryImage>>,
        plan: &MemoryPlan,
    ) -> Result<()> {
        assert!(!self.dirty);
        assert!(initial_size_bytes <= self.static_size);

        // First order of business is to blow away the previous linear memory
        // image if it doesn't match the image specified here. If one is
        // detected then it's reset with anonymous memory which means that all
        // of memory up to `self.accessible` will now be read/write and zero.
        //
        // Note that this is intentionally a "small mmap" which only covers the
        // extent of the prior initialization image in order to preserve
        // resident memory that might come before or after the image.
        if self.image.as_ref() != maybe_image {
            self.remove_image()?;
        }

        // The next order of business is to ensure that `self.accessible` is
        // appropriate. First up is to grow the read/write portion of memory if
        // it's not large enough to accommodate `initial_size_bytes`.
        if self.accessible < initial_size_bytes {
            self.set_protection(self.accessible..initial_size_bytes, true)?;
            self.accessible = initial_size_bytes;
        }

        // If (1) the accessible region is not in its initial state, and (2) the
        // memory relies on virtual memory at all (i.e. has offset guard pages
        // and/or is static), then we need to reset memory protections. Put
        // another way, the only time it is safe to not reset protections is
        // when we are using dynamic memory without any guard pages.
        if initial_size_bytes < self.accessible
            && (plan.offset_guard_size > 0 || matches!(plan.style, MemoryStyle::Static { .. }))
        {
            self.set_protection(initial_size_bytes..self.accessible, false)?;
            self.accessible = initial_size_bytes;
        }

        // Now that memory is sized appropriately the final operation is to
        // place the new image into linear memory. Note that this operation is
        // skipped if `self.image` matches `maybe_image`.
        assert!(initial_size_bytes <= self.accessible);
        if self.image.as_ref() != maybe_image {
            if let Some(image) = maybe_image.as_ref() {
                assert!(
                    image.linear_memory_offset.checked_add(image.len).unwrap()
                        <= initial_size_bytes
                );
                if image.len > 0 {
                    // SAFETY: asserted just above that the image lies within
                    // `initial_size_bytes`, i.e. inside this slot's own
                    // accessible region based at `self.base`.
                    unsafe {
                        image.map_at(self.base)?;
                    }
                }
            }
            self.image = maybe_image.cloned();
        }

        // Flag ourselves as `dirty` which means that the next operation on this
        // slot is required to be `clear_and_remain_ready`.
        self.dirty = true;

        Ok(())
    }
551
552    pub(crate) fn remove_image(&mut self) -> Result<()> {
553        if let Some(image) = &self.image {
554            unsafe {
555                image.remap_as_zeros_at(self.base)?;
556            }
557            self.image = None;
558        }
559        Ok(())
560    }
561
    /// Resets this linear memory slot back to a "pristine state".
    ///
    /// This will reset the memory back to its original contents on Linux or
    /// reset the contents back to zero on other platforms. The `keep_resident`
    /// argument is the maximum amount of memory to keep resident in this
    /// process's memory on Linux. Up to that much memory will be `memset` to
    /// zero where the rest of it will be reset or released with `madvise`.
    #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
    pub(crate) fn clear_and_remain_ready(&mut self, keep_resident: usize) -> Result<()> {
        // Only a slot that was previously instantiated can be cleared.
        assert!(self.dirty);

        // SAFETY: `reset_all_memory_contents` operates within the mapping
        // described by `self.base`/`self.accessible`, which this slot owns.
        unsafe {
            self.reset_all_memory_contents(keep_resident)?;
        }

        // The slot is pristine again and ready for the next `instantiate`.
        self.dirty = false;
        Ok(())
    }
580
    /// Restores all accessible memory to its post-instantiation contents:
    /// zeroes outside the image and original image bytes within it.
    ///
    /// Up to `keep_resident` bytes are `memset` in place (keeping those pages
    /// resident); the remainder is handled with `madvise` on Linux, or by
    /// remapping everything as anonymous zero memory on other platforms.
    ///
    /// # Safety
    ///
    /// Caller must guarantee that `self.base .. self.base + self.accessible`
    /// is a valid, writable mapping owned by this slot.
    #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
    unsafe fn reset_all_memory_contents(&mut self, keep_resident: usize) -> Result<()> {
        if !cfg!(target_os = "linux") {
            // If we're not on Linux then there's no generic platform way to
            // reset memory back to its original state, so instead reset memory
            // back to entirely zeros with an anonymous backing.
            //
            // Additionally the previous image, if any, is dropped here
            // since it's no longer applicable to this mapping.
            return self.reset_with_anon_memory();
        }

        match &self.image {
            Some(image) => {
                assert!(self.accessible >= image.linear_memory_offset + image.len);
                if image.linear_memory_offset < keep_resident {
                    // If the image starts below the `keep_resident` then
                    // memory looks something like this:
                    //
                    //               up to `keep_resident` bytes
                    //                          |
                    //          +--------------------------+  remaining_memset
                    //          |                          | /
                    //  <-------------->                <------->
                    //
                    //                              image_end
                    // 0        linear_memory_offset   |             accessible
                    // |                |              |                  |
                    // +----------------+--------------+---------+--------+
                    // |  dirty memory  |    image     |   dirty memory   |
                    // +----------------+--------------+---------+--------+
                    //
                    //  <------+-------> <-----+----->  <---+---> <--+--->
                    //         |               |            |        |
                    //         |               |            |        |
                    //   memset (1)            /            |   madvise (4)
                    //                   madvise (2)       /
                    //                                    /
                    //                              memset (3)
                    //
                    //
                    // In this situation there are two disjoint regions that are
                    // `memset` manually to zero. Note that `memset (3)` may be
                    // zero bytes large. Furthermore `madvise (4)` may also be
                    // zero bytes large.

                    let image_end = image.linear_memory_offset + image.len;
                    let mem_after_image = self.accessible - image_end;
                    let remaining_memset =
                        (keep_resident - image.linear_memory_offset).min(mem_after_image);

                    // This is memset (1)
                    std::ptr::write_bytes(self.base as *mut u8, 0u8, image.linear_memory_offset);

                    // This is madvise (2)
                    self.madvise_reset(image.linear_memory_offset, image.len)?;

                    // This is memset (3)
                    std::ptr::write_bytes(
                        (self.base + image_end) as *mut u8,
                        0u8,
                        remaining_memset,
                    );

                    // This is madvise (4)
                    self.madvise_reset(
                        image_end + remaining_memset,
                        mem_after_image - remaining_memset,
                    )?;
                } else {
                    // If the image starts after the `keep_resident` threshold
                    // then we memset the start of linear memory and then use
                    // madvise below for the rest of it, including the image.
                    //
                    // 0             keep_resident                   accessible
                    // |                |                                 |
                    // +----------------+---+----------+------------------+
                    // |  dirty memory      |  image   |   dirty memory   |
                    // +----------------+---+----------+------------------+
                    //
                    //  <------+-------> <-------------+----------------->
                    //         |                       |
                    //         |                       |
                    //   memset (1)                 madvise (2)
                    //
                    // Here only a single memset is necessary since the image
                    // started after the threshold which we're keeping resident.
                    // Note that the memset may be zero bytes here.

                    // This is memset (1)
                    std::ptr::write_bytes(self.base as *mut u8, 0u8, keep_resident);

                    // This is madvise (2)
                    self.madvise_reset(keep_resident, self.accessible - keep_resident)?;
                }
            }

            // If there's no memory image for this slot then memset the first
            // bytes in the memory back to zero while using `madvise` to purge
            // the rest.
            None => {
                let size_to_memset = keep_resident.min(self.accessible);
                std::ptr::write_bytes(self.base as *mut u8, 0u8, size_to_memset);
                self.madvise_reset(size_to_memset, self.accessible - size_to_memset)?;
            }
        }

        Ok(())
    }
690
    /// Releases `len` bytes at slot-relative offset `base` back to the kernel
    /// with `madvise(MADV_DONTNEED)`, which restores the original mapping
    /// contents (zeroes, or image bytes for the CoW file mapping).
    ///
    /// Only reachable on Linux; callers gate on `cfg!(target_os = "linux")`.
    ///
    /// # Safety
    ///
    /// `self.base + base .. + len` must be a valid mapping owned by this slot.
    #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
    unsafe fn madvise_reset(&self, base: usize, len: usize) -> Result<()> {
        // The range must stay within the accessible (R/W) part of the slot.
        assert!(base + len <= self.accessible);
        if len == 0 {
            return Ok(());
        }
        cfg_if::cfg_if! {
            if #[cfg(target_os = "linux")] {
                rustix::mm::madvise(
                    (self.base + base) as *mut c_void,
                    len,
                    rustix::mm::Advice::LinuxDontNeed,
                )?;
                Ok(())
            } else {
                unreachable!();
            }
        }
    }
710
711    fn set_protection(&self, range: Range<usize>, readwrite: bool) -> Result<()> {
712        assert!(range.start <= range.end);
713        assert!(range.end <= self.static_size);
714        let start = self.base.checked_add(range.start).unwrap();
715        if range.len() == 0 {
716            return Ok(());
717        }
718
719        unsafe {
720            cfg_if::cfg_if! {
721                if #[cfg(unix)] {
722                    let flags = if readwrite {
723                        rustix::mm::MprotectFlags::READ | rustix::mm::MprotectFlags::WRITE
724                    } else {
725                        rustix::mm::MprotectFlags::empty()
726                    };
727                    rustix::mm::mprotect(start as *mut _, range.len(), flags)?;
728                } else {
729                    use windows_sys::Win32::System::Memory::*;
730
731                    let failure = if readwrite {
732                        VirtualAlloc(start as _, range.len(), MEM_COMMIT, PAGE_READWRITE).is_null()
733                    } else {
734                        VirtualFree(start as _, range.len(), MEM_DECOMMIT) == 0
735                    };
736                    if failure {
737                        return Err(std::io::Error::last_os_error().into());
738                    }
739                }
740            }
741        }
742
743        Ok(())
744    }
745
746    pub(crate) fn has_image(&self) -> bool {
747        self.image.is_some()
748    }
749
    /// Returns this slot's `dirty` flag: whether the slot has been
    /// instantiated since it was last cleared/reset (i.e. it may contain
    /// instance-specific state).
    #[allow(dead_code)] // ignore warnings as this is only used in some cfgs
    pub(crate) fn is_dirty(&self) -> bool {
        self.dirty
    }
754
755    /// Map anonymous zeroed memory across the whole slot,
756    /// inaccessible. Used both during instantiate and during drop.
757    fn reset_with_anon_memory(&mut self) -> Result<()> {
758        if self.static_size == 0 {
759            assert!(self.image.is_none());
760            assert_eq!(self.accessible, 0);
761            return Ok(());
762        }
763
764        unsafe {
765            cfg_if::cfg_if! {
766                if #[cfg(unix)] {
767                    let ptr = rustix::mm::mmap_anonymous(
768                        self.base as *mut c_void,
769                        self.static_size,
770                        rustix::mm::ProtFlags::empty(),
771                        rustix::mm::MapFlags::PRIVATE | rustix::mm::MapFlags::FIXED,
772                    )?;
773                    assert_eq!(ptr as usize, self.base);
774                } else {
775                    use windows_sys::Win32::System::Memory::*;
776                    if VirtualFree(self.base as _, self.static_size, MEM_DECOMMIT) == 0 {
777                        return Err(std::io::Error::last_os_error().into());
778                    }
779                }
780            }
781        }
782
783        self.image = None;
784        self.accessible = 0;
785
786        Ok(())
787    }
788}
789
790impl Drop for MemoryImageSlot {
791    fn drop(&mut self) {
792        // The MemoryImageSlot may be dropped if there is an error during
793        // instantiation: for example, if a memory-growth limiter
794        // disallows a guest from having a memory of a certain size,
795        // after we've already initialized the MemoryImageSlot.
796        //
797        // We need to return this region of the large pool mmap to a
798        // safe state (with no module-specific mappings). The
799        // MemoryImageSlot will not be returned to the MemoryPool, so a new
800        // MemoryImageSlot will be created and overwrite the mappings anyway
801        // on the slot's next use; but for safety and to avoid
802        // resource leaks it's better not to have stale mappings to a
803        // possibly-otherwise-dead module's image.
804        //
805        // To "wipe the slate clean", let's do a mmap of anonymous
806        // memory over the whole region, with PROT_NONE. Note that we
807        // *can't* simply munmap, because that leaves a hole in the
808        // middle of the pooling allocator's big memory area that some
809        // other random mmap may swoop in and take, to be trampled
810        // over by the next MemoryImageSlot later.
811        //
812        // Since we're in drop(), we can't sanely return an error if
813        // this mmap fails. Instead though the result is unwrapped here to
814        // trigger a panic if something goes wrong. Otherwise if this
815        // reset-the-mapping fails then on reuse it might be possible, depending
816        // on precisely where errors happened, that stale memory could get
817        // leaked through.
818        //
819        // The exception to all of this is if the `clear_on_drop` flag
820        // (which is set by default) is false. If so, the owner of
821        // this MemoryImageSlot has indicated that it will clean up in some
822        // other way.
823        if self.clear_on_drop {
824            self.reset_with_anon_memory().unwrap();
825        }
826    }
827}
828
#[cfg(all(test, target_os = "linux"))]
mod test {
    use std::sync::Arc;

    use super::{create_memfd, FdSource, MemoryImage, MemoryImageSlot, MemoryPlan, MemoryStyle};
    use crate::mmap::Mmap;
    use anyhow::Result;
    use std::io::Write;
    use wasmtime_environ::Memory;

    /// Builds a `MemoryImage` whose backing memfd contains `data`, to be
    /// placed `offset` bytes into linear memory. `offset` must be
    /// page-aligned; the image length is `data.len()` rounded up to a page.
    fn create_memfd_with_data(offset: usize, data: &[u8]) -> Result<MemoryImage> {
        // Offset must be page-aligned.
        let page_size = crate::page_size();
        assert_eq!(offset & (page_size - 1), 0);
        let memfd = create_memfd()?;
        memfd.as_file().write_all(data)?;

        // The image length is rounded up to the nearest page size
        let image_len = (data.len() + page_size - 1) & !(page_size - 1);
        memfd.as_file().set_len(image_len as u64)?;

        Ok(MemoryImage {
            fd: FdSource::Memfd(memfd),
            len: image_len,
            fd_offset: 0,
            linear_memory_offset: offset,
        })
    }

    /// Creates a minimal `MemoryPlan` (zero minimum, no maximum, no guard
    /// regions) with the given style, sufficient for exercising
    /// `MemoryImageSlot` in these tests.
    fn dummy_memory_plan(style: MemoryStyle) -> MemoryPlan {
        MemoryPlan {
            style,
            memory: Memory {
                minimum: 0,
                maximum: None,
                shared: false,
                memory64: false,
            },
            pre_guard_size: 0,
            offset_guard_size: 0,
        }
    }

    /// Instantiation without a CoW image: memory starts zeroed, can be
    /// grown, and is re-zeroed by `clear_and_remain_ready`.
    #[test]
    fn instantiate_no_image() {
        let plan = dummy_memory_plan(MemoryStyle::Static { bound: 4 << 30 });
        // 4 MiB mmap'd area, not accessible
        let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
        // Create a MemoryImageSlot on top of it
        let mut memfd = MemoryImageSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
        memfd.no_clear_on_drop();
        assert!(!memfd.is_dirty());
        // instantiate with 64 KiB initial size
        memfd.instantiate(64 << 10, None, &plan).unwrap();
        assert!(memfd.is_dirty());
        // We should be able to access this 64 KiB (try both ends) and
        // it should consist of zeroes.
        let slice = mmap.as_mut_slice();
        assert_eq!(0, slice[0]);
        assert_eq!(0, slice[65535]);
        slice[1024] = 42;
        assert_eq!(42, slice[1024]);
        // grow the heap
        memfd.set_heap_limit(128 << 10).unwrap();
        let slice = mmap.as_slice();
        assert_eq!(42, slice[1024]);
        assert_eq!(0, slice[131071]);
        // instantiate again; we should see zeroes, even as the
        // reuse-anon-mmap-opt kicks in
        memfd.clear_and_remain_ready(0).unwrap();
        assert!(!memfd.is_dirty());
        memfd.instantiate(64 << 10, None, &plan).unwrap();
        let slice = mmap.as_slice();
        assert_eq!(0, slice[1024]);
    }

    /// Instantiation with a CoW image: the image's bytes appear at its
    /// offset, guest writes don't leak across re-instantiations, and
    /// switching between different images (or none) remaps correctly.
    #[test]
    fn instantiate_image() {
        let plan = dummy_memory_plan(MemoryStyle::Static { bound: 4 << 30 });
        // 4 MiB mmap'd area, not accessible
        let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
        // Create a MemoryImageSlot on top of it
        let mut memfd = MemoryImageSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
        memfd.no_clear_on_drop();
        // Create an image with some data.
        let image = Arc::new(create_memfd_with_data(4096, &[1, 2, 3, 4]).unwrap());
        // Instantiate with this image
        memfd.instantiate(64 << 10, Some(&image), &plan).unwrap();
        assert!(memfd.has_image());
        let slice = mmap.as_mut_slice();
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
        slice[4096] = 5;
        // Clear and re-instantiate same image
        memfd.clear_and_remain_ready(0).unwrap();
        memfd.instantiate(64 << 10, Some(&image), &plan).unwrap();
        let slice = mmap.as_slice();
        // Should not see mutation from above
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
        // Clear and re-instantiate no image
        memfd.clear_and_remain_ready(0).unwrap();
        memfd.instantiate(64 << 10, None, &plan).unwrap();
        assert!(!memfd.has_image());
        let slice = mmap.as_slice();
        assert_eq!(&[0, 0, 0, 0], &slice[4096..4100]);
        // Clear and re-instantiate image again
        memfd.clear_and_remain_ready(0).unwrap();
        memfd.instantiate(64 << 10, Some(&image), &plan).unwrap();
        let slice = mmap.as_slice();
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
        // Create another image with different data.
        let image2 = Arc::new(create_memfd_with_data(4096, &[10, 11, 12, 13]).unwrap());
        memfd.clear_and_remain_ready(0).unwrap();
        memfd.instantiate(128 << 10, Some(&image2), &plan).unwrap();
        let slice = mmap.as_slice();
        assert_eq!(&[10, 11, 12, 13], &slice[4096..4100]);
        // Instantiate the original image again; we should notice it's
        // a different image and not reuse the mappings.
        memfd.clear_and_remain_ready(0).unwrap();
        memfd.instantiate(64 << 10, Some(&image), &plan).unwrap();
        let slice = mmap.as_slice();
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
    }

    /// Exercises `clear_and_remain_ready` across a sweep of
    /// `keep_resident` sizes and image offsets, covering both the
    /// memset-based and madvise-based reset paths.
    #[test]
    #[cfg(target_os = "linux")]
    fn memset_instead_of_madvise() {
        let plan = dummy_memory_plan(MemoryStyle::Static { bound: 100 });
        let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
        let mut memfd = MemoryImageSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
        memfd.no_clear_on_drop();

        // Test basics with the image
        for image_off in [0, 4096, 8 << 10] {
            let image = Arc::new(create_memfd_with_data(image_off, &[1, 2, 3, 4]).unwrap());
            // keep-resident amounts before, at, and beyond the image/heap sizes
            for amt_to_memset in [0, 4096, 10 << 12, 1 << 20, 10 << 20] {
                memfd.instantiate(64 << 10, Some(&image), &plan).unwrap();
                assert!(memfd.has_image());
                let slice = mmap.as_mut_slice();
                if image_off > 0 {
                    assert_eq!(slice[image_off - 1], 0);
                }
                assert_eq!(slice[image_off + 5], 0);
                assert_eq!(&[1, 2, 3, 4], &slice[image_off..][..4]);
                slice[image_off] = 5;
                assert_eq!(&[5, 2, 3, 4], &slice[image_off..][..4]);
                memfd.clear_and_remain_ready(amt_to_memset).unwrap();
            }
        }

        // Test without an image
        for amt_to_memset in [0, 4096, 10 << 12, 1 << 20, 10 << 20] {
            memfd.instantiate(64 << 10, None, &plan).unwrap();
            // Dirty a byte in every 1 KiB chunk, then verify the next
            // instantiation observed zeroes again.
            for chunk in mmap.as_mut_slice()[..64 << 10].chunks_mut(1024) {
                assert_eq!(chunk[0], 0);
                chunk[0] = 5;
            }
            memfd.clear_and_remain_ready(amt_to_memset).unwrap();
        }
    }

    /// Dynamic-style memories: memory beyond the initial size stays
    /// accessible across clears, and growth/reset interact correctly.
    #[test]
    #[cfg(target_os = "linux")]
    fn dynamic() {
        let plan = dummy_memory_plan(MemoryStyle::Dynamic { reserve: 200 });

        let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
        let mut memfd = MemoryImageSlot::create(mmap.as_mut_ptr() as *mut _, 0, 4 << 20);
        memfd.no_clear_on_drop();
        let image = Arc::new(create_memfd_with_data(4096, &[1, 2, 3, 4]).unwrap());
        let initial = 64 << 10;

        // Instantiate the image and test that memory remains accessible after
        // it's cleared.
        memfd.instantiate(initial, Some(&image), &plan).unwrap();
        assert!(memfd.has_image());
        let slice = mmap.as_mut_slice();
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
        slice[4096] = 5;
        assert_eq!(&[5, 2, 3, 4], &slice[4096..4100]);
        memfd.clear_and_remain_ready(0).unwrap();
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);

        // Re-instantiate make sure it preserves memory. Grow a bit and set data
        // beyond the initial size.
        memfd.instantiate(initial, Some(&image), &plan).unwrap();
        assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
        memfd.set_heap_limit(initial * 2).unwrap();
        assert_eq!(&[0, 0], &slice[initial..initial + 2]);
        slice[initial] = 100;
        assert_eq!(&[100, 0], &slice[initial..initial + 2]);
        memfd.clear_and_remain_ready(0).unwrap();

        // Test that memory is still accessible, but it's been reset
        assert_eq!(&[0, 0], &slice[initial..initial + 2]);

        // Instantiate again, and again memory beyond the initial size should
        // still be accessible. Grow into it again and make sure it works.
        memfd.instantiate(initial, Some(&image), &plan).unwrap();
        assert_eq!(&[0, 0], &slice[initial..initial + 2]);
        memfd.set_heap_limit(initial * 2).unwrap();
        assert_eq!(&[0, 0], &slice[initial..initial + 2]);
        slice[initial] = 100;
        assert_eq!(&[100, 0], &slice[initial..initial + 2]);
        memfd.clear_and_remain_ready(0).unwrap();

        // Reset the image to none and double-check everything is back to zero
        memfd.instantiate(64 << 10, None, &plan).unwrap();
        assert!(!memfd.has_image());
        assert_eq!(&[0, 0, 0, 0], &slice[4096..4100]);
        assert_eq!(&[0, 0], &slice[initial..initial + 2]);
    }
}