diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index db3abe1eced..7bb16541bfd 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -743,6 +743,9 @@ { "syscall": "exit_group" }, + { + "syscall": "read" + }, { "syscall": "write" }, diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 95ceca1b7ef..ca479fde533 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -755,6 +755,9 @@ { "syscall": "exit_group" }, + { + "syscall": "read" + }, { "syscall": "write" }, diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index ca7601ebf25..61d1569ccd4 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -8,11 +8,14 @@ mod uffd_utils; use std::fs::File; +use std::os::fd::AsRawFd; use std::os::unix::net::UnixListener; use uffd_utils::{Runtime, UffdHandler}; use utils::time::{ClockType, get_time_us}; +use crate::uffd_utils::uffd_continue; + fn main() { let mut args = std::env::args(); let uffd_sock_path = args.nth(1).expect("No socket path given"); @@ -23,27 +26,83 @@ fn main() { // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + stream + .set_nonblocking(true) + .expect("Cannot set non-blocking"); let mut runtime = Runtime::new(stream, file); runtime.install_panic_hook(); - runtime.run(|uffd_handler: &mut UffdHandler| { - // Read an event from the userfaultfd. 
- let event = uffd_handler - .read_event() - .expect("Failed to read uffd_msg") - .expect("uffd_msg not ready"); - - match event { - userfaultfd::Event::Pagefault { .. } => { - let start = get_time_us(ClockType::Monotonic); - for region in uffd_handler.mem_regions.clone() { - uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size); + runtime.run( + |uffd_handler: &mut UffdHandler| { + // Read an event from the userfaultfd. + let event = uffd_handler + .read_event() + .expect("Failed to read uffd_msg") + .expect("uffd_msg not ready"); + + if let userfaultfd::Event::Pagefault { addr, .. } = event { + let bit = + uffd_handler.addr_to_offset(addr.cast()) as usize / uffd_handler.page_size; + + // If Secret Free, we know if this is the first fault based on the userfault + // bitmap state. Otherwise, we assume that we will ever only receive a single fault + // event via UFFD. + let are_we_faulted_yet = uffd_handler + .userfault_bitmap + .as_mut() + .map_or(false, |bitmap| !bitmap.is_bit_set(bit)); + + if are_we_faulted_yet { + // TODO: we currently ignore the result as we may attempt to + // populate the page that is already present as we may receive + // multiple minor fault events per page. 
+ let _ = uffd_continue( + uffd_handler.uffd.as_raw_fd(), + addr as _, + uffd_handler.page_size as u64, + ) + .inspect_err(|err| println!("Error during uffdio_continue: {:?}", err)); + } else { + fault_all(uffd_handler, addr); } - let end = get_time_us(ClockType::Monotonic); + } + }, + |_uffd_handler: &mut UffdHandler, _offset: usize| {}, + ); +} - println!("Finished Faulting All: {}us", end - start); +fn fault_all(uffd_handler: &mut UffdHandler, fault_addr: *mut libc::c_void) { + let start = get_time_us(ClockType::Monotonic); + for region in uffd_handler.mem_regions.clone() { + match uffd_handler.guest_memfd { + None => { + uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size); + } + Some(_) => { + let written = uffd_handler.populate_via_write(region.offset as usize, region.size); + + // This code is written under the assumption that the first fault triggered by + // Firecracker is either due to an MSR write (on x86) or due to device restoration + // reading from guest memory to check the virtio queues are sane (on + // ARM). This will be reported via a UFFD minor fault which needs to + // be handled via memcpy. Importantly, we get to the UFFD handler + // with the actual guest_memfd page already faulted in, meaning pwrite will stop + // once it gets to the offset of that page (e.g. written < region.size above). + // Thus, to fault in everything, we now need to skip this one page, write the + // remaining region, and then deal with the "gap" via uffd_handler.serve_pf(). 
+ + if written < region.size - uffd_handler.page_size { + let r = uffd_handler.populate_via_write( + region.offset as usize + written + uffd_handler.page_size, + region.size - written - uffd_handler.page_size, + ); + assert_eq!(written + r, region.size - uffd_handler.page_size); + } } - _ => panic!("Unexpected event on userfaultfd"), } - }); + } + uffd_handler.serve_pf(fault_addr.cast(), uffd_handler.page_size); + let end = get_time_us(ClockType::Monotonic); + + println!("Finished Faulting All: {}us", end - start); } diff --git a/src/firecracker/examples/uffd/malicious_handler.rs b/src/firecracker/examples/uffd/malicious_handler.rs index 9af94e057aa..c926b976207 100644 --- a/src/firecracker/examples/uffd/malicious_handler.rs +++ b/src/firecracker/examples/uffd/malicious_handler.rs @@ -21,17 +21,23 @@ fn main() { // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + stream + .set_nonblocking(true) + .expect("Cannot set non-blocking"); let mut runtime = Runtime::new(stream, file); - runtime.run(|uffd_handler: &mut UffdHandler| { - // Read an event from the userfaultfd. - let event = uffd_handler - .read_event() - .expect("Failed to read uffd_msg") - .expect("uffd_msg not ready"); - - if let userfaultfd::Event::Pagefault { .. } = event { - panic!("Fear me! I am the malicious page fault handler.") - } - }); + runtime.run( + |uffd_handler: &mut UffdHandler| { + // Read an event from the userfaultfd. + let event = uffd_handler + .read_event() + .expect("Failed to read uffd_msg") + .expect("uffd_msg not ready"); + + if let userfaultfd::Event::Pagefault { .. } = event { + panic!("Fear me! 
I am the malicious page fault handler.") + } + }, + |_uffd_handler: &mut UffdHandler, _offset: usize| {}, + ); } diff --git a/src/firecracker/examples/uffd/on_demand_handler.rs b/src/firecracker/examples/uffd/on_demand_handler.rs index 3be958b3578..d8686b1af61 100644 --- a/src/firecracker/examples/uffd/on_demand_handler.rs +++ b/src/firecracker/examples/uffd/on_demand_handler.rs @@ -8,10 +8,13 @@ mod uffd_utils; use std::fs::File; +use std::os::fd::AsRawFd; use std::os::unix::net::UnixListener; use uffd_utils::{Runtime, UffdHandler}; +use crate::uffd_utils::uffd_continue; + fn main() { let mut args = std::env::args(); let uffd_sock_path = args.nth(1).expect("No socket path given"); @@ -22,84 +25,134 @@ fn main() { // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker. let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path"); let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); + stream + .set_nonblocking(true) + .expect("Cannot set non-blocking"); let mut runtime = Runtime::new(stream, file); runtime.install_panic_hook(); - runtime.run(|uffd_handler: &mut UffdHandler| { - // !DISCLAIMER! - // When using UFFD together with the balloon device, this handler needs to deal with - // `remove` and `pagefault` events. There are multiple things to keep in mind in - // such setups: - // - // As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN - // ----------------------------------------------------------------------------------- - // - // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event - // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the - // UFFD, and then go back to the process the pre-fetched events. 
- // - // UFFD might receive events in not in their causal order - // ----------------------------------------------------- - // - // For example, the guest - // kernel might first respond to a balloon inflation by freeing some memory, and - // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the - // free memory range, which causes a `remove` event to be sent to UFFD. Then, the - // guest kernel might immediately fault the page in again (for example because - // default_on_oom was set). which causes a `pagefault` event to be sent to UFFD. - // - // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the - // balloon device is handled by Firecracker on its VMM thread. This means that potentially - // this handler can receive the `pagefault` _before_ the `remove` event. - // - // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events - // to make sure no `remove` event is blocking us can result in the handler acting on - // the `pagefault` event before the `remove` message (despite the `remove` event being - // in the causal past of the `pagefault` event), which means that we will fault in a page - // from the snapshot file, while really we should be faulting in a zero page. - // - // In this example handler, we ignore this problem, to avoid - // complexity (under the assumption that the guest kernel will zero a newly faulted in - // page anyway). A production handler will most likely want to ensure that `remove` - // events for a specific range are always handled before `pagefault` events. - // - // Lastly, we still need to deal with the race condition where a `remove` event arrives - // in the UFFD queue after we got done reading all events, in which case we need to go - // back to reading more events before we can continue processing `pagefault`s. - let mut deferred_events = Vec::new(); + runtime.run( + |uffd_handler: &mut UffdHandler| { + // !DISCLAIMER! 
+ // When using UFFD together with the balloon device, this handler needs to deal with + // `remove` and `pagefault` events. There are multiple things to keep in mind in + // such setups: + // + // As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN + // ----------------------------------------------------------------------------------- + // + // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` + // event arrives, we need to pre-fetch all other events up to the `remove` + // event, to unblock the UFFD, and then go back to the process the + // pre-fetched events. + // + // UFFD might receive events in not in their causal order + // ----------------------------------------------------- + // + // For example, the guest + // kernel might first respond to a balloon inflation by freeing some memory, and + // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the + // free memory range, which causes a `remove` event to be sent to UFFD. Then, the + // guest kernel might immediately fault the page in again (for example because + // default_on_oom was set). which causes a `pagefault` event to be sent to UFFD. + // + // However, the pagefault will be triggered from inside KVM on the vCPU thread, while + // the balloon device is handled by Firecracker on its VMM thread. This + // means that potentially this handler can receive the `pagefault` _before_ + // the `remove` event. + // + // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events + // to make sure no `remove` event is blocking us can result in the handler acting on + // the `pagefault` event before the `remove` message (despite the `remove` event being + // in the causal past of the `pagefault` event), which means that we will fault in a + // page from the snapshot file, while really we should be faulting in a zero + // page. 
+ // + // In this example handler, we ignore this problem, to avoid + // complexity (under the assumption that the guest kernel will zero a newly faulted in + // page anyway). A production handler will most likely want to ensure that `remove` + // events for a specific range are always handled before `pagefault` events. + // + // Lastly, we still need to deal with the race condition where a `remove` event arrives + // in the UFFD queue after we got done reading all events, in which case we need to go + // back to reading more events before we can continue processing `pagefault`s. + let mut deferred_events = Vec::new(); - loop { - // First, try events that we couldn't handle last round - let mut events_to_handle = Vec::from_iter(deferred_events.drain(..)); + loop { + // First, try events that we couldn't handle last round + let mut events_to_handle = Vec::from_iter(deferred_events.drain(..)); - // Read all events from the userfaultfd. - while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") { - events_to_handle.push(event); - } + // Read all events from the userfaultfd. + while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") + { + events_to_handle.push(event); + } + + for event in events_to_handle.drain(..) { + // We expect to receive either a Page Fault or `remove` + // event (if the balloon device is enabled). + match event { + userfaultfd::Event::Pagefault { addr, .. } => { + let bit = uffd_handler.addr_to_offset(addr.cast()) as usize + / uffd_handler.page_size; - for event in events_to_handle.drain(..) { - // We expect to receive either a Page Fault or `remove` - // event (if the balloon device is enabled). - match event { - userfaultfd::Event::Pagefault { addr, .. 
} => { - if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) { - deferred_events.push(event); + if uffd_handler.userfault_bitmap.is_some() { + if uffd_handler + .userfault_bitmap + .as_mut() + .unwrap() + .is_bit_set(bit) + { + if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) { + deferred_events.push(event); + } + } else { + // TODO: we currently ignore the result as we may attempt to + // populate the page that is already present as we may receive + // multiple minor fault events per page. + let _ = uffd_continue( + uffd_handler.uffd.as_raw_fd(), + addr as _, + uffd_handler.page_size as u64, + ) + .inspect_err(|err| { + println!("uffdio_continue error: {:?}", err) + }); + } + } else { + if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) { + deferred_events.push(event); + } + } } + userfaultfd::Event::Remove { start, end } => { + uffd_handler.mark_range_removed(start as u64, end as u64) + } + _ => panic!("Unexpected event on userfaultfd"), } - userfaultfd::Event::Remove { start, end } => { - uffd_handler.mark_range_removed(start as u64, end as u64) - } - _ => panic!("Unexpected event on userfaultfd"), + } + + // We assume that really only the above removed/pagefault interaction can result in + // deferred events. In that scenario, the loop will always terminate (unless + // newly arriving `remove` events end up indefinitely blocking it, but there's + // nothing we can do about that, and it's a largely theoretical + // problem). + if deferred_events.is_empty() { + break; } } + }, + |uffd_handler: &mut UffdHandler, offset: usize| { + let bytes_written = uffd_handler.populate_via_write(offset, uffd_handler.page_size); - // We assume that really only the above removed/pagefault interaction can result in - // deferred events. 
In that scenario, the loop will always terminate (unless - // newly arriving `remove` events end up indefinitely blocking it, but there's nothing - // we can do about that, and it's a largely theoretical problem). - if deferred_events.is_empty() { - break; + if bytes_written == 0 { + println!( + "got a vcpu fault for an already populated page at offset {}", + offset + ); + } else { + assert_eq!(bytes_written, uffd_handler.page_size); } - } - }); + }, + ); } diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index 07ed48f9439..beb09b2a7e6 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -4,17 +4,64 @@ // Not everything is used by both binaries #![allow(dead_code)] +mod userfault_bitmap; + use std::collections::{HashMap, HashSet}; use std::ffi::c_void; use std::fs::File; +use std::io::{Read, Write}; +use std::num::NonZero; +use std::ops::DerefMut; +use std::os::fd::RawFd; use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; use std::os::unix::net::UnixStream; use std::ptr; +use std::sync::atomic::AtomicU64; use std::time::Duration; use serde::{Deserialize, Serialize}; +use serde_json::{Deserializer, StreamDeserializer}; use userfaultfd::{Error, Event, Uffd}; +use vmm_sys_util::ioctl::ioctl_with_mut_ref; use vmm_sys_util::sock_ctrl_msg::ScmSocket; +use vmm_sys_util::{ioctl_ioc_nr, ioctl_iowr_nr}; + +use crate::uffd_utils::userfault_bitmap::UserfaultBitmap; + +// TODO: remove when UFFDIO_CONTINUE for guest_memfd is available in the crate +#[repr(C)] +struct uffdio_continue { + range: uffdio_range, + mode: u64, + mapped: u64, +} + +ioctl_iowr_nr!(UFFDIO_CONTINUE, 0xAA, 0x7, uffdio_continue); + +#[repr(C)] +struct uffdio_range { + start: u64, + len: u64, +} + +pub fn uffd_continue(uffd: RawFd, fault_addr: u64, len: u64) -> std::io::Result<()> { + let mut cont = uffdio_continue { + range: uffdio_range { + start: fault_addr, + len, + }, + mode: 0, // Normal 
continuation mode + mapped: 0, + }; + + let ret = unsafe { ioctl_with_mut_ref(&uffd, UFFDIO_CONTINUE(), &mut cont) }; + + if ret == -1 { + return Err(std::io::Error::last_os_error()); + } + + Ok(()) +} // This is the same with the one used in src/vmm. /// This describes the mapping between Firecracker base virtual address and offset in the @@ -36,6 +83,66 @@ pub struct GuestRegionUffdMapping { pub page_size: usize, } +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct FaultRequest { + /// vCPU that encountered the fault (not meaningful to Fission) + pub vcpu: u32, + /// Offset in guest_memfd where the fault occured + pub offset: u64, + /// Flags (not meaningful to Fission) + pub flags: u64, + /// Async PF token (not meaningful to Fission) + pub token: Option, +} + +impl FaultRequest { + pub fn into_reply(self, len: u64) -> FaultReply { + FaultReply { + vcpu: Some(self.vcpu), + offset: self.offset, + len, + flags: self.flags, + token: self.token, + zero: false, + } + } +} + +/// FaultReply +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct FaultReply { + /// vCPU that encountered the fault, from `FaultRequest` (if present, otherwise 0) + pub vcpu: Option, + /// Offset in guest_memfd where population started + pub offset: u64, + /// Length of populated area + pub len: u64, + /// Flags, must be copied from `FaultRequest`, otherwise 0 + pub flags: u64, + /// Async PF token, must be copied from `FaultRequest`, otherwise None + pub token: Option, + /// Whether the populated pages are zero pages + pub zero: bool, +} + +/// UffdMsgFromFirecracker +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +pub enum UffdMsgFromFirecracker { + /// Mappings + Mappings(Vec), + /// FaultReq + FaultReq(FaultRequest), +} + +/// UffdMsgToFirecracker +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +pub enum UffdMsgToFirecracker { + /// FaultRep + FaultRep(FaultReply), +} + impl GuestRegionUffdMapping { fn 
contains(&self, fault_page_addr: u64) -> bool { fault_page_addr >= self.base_host_virt_addr @@ -48,8 +155,11 @@ pub struct UffdHandler { pub mem_regions: Vec, pub page_size: usize, backing_buffer: *const u8, - uffd: Uffd, + pub uffd: Uffd, removed_pages: HashSet, + pub guest_memfd: Option, + pub guest_memfd_addr: Option<*mut u8>, + pub userfault_bitmap: Option, } impl UffdHandler { @@ -93,17 +203,37 @@ impl UffdHandler { panic!("Could not get UFFD and mappings after 5 retries"); } - pub fn from_unix_stream(stream: &UnixStream, backing_buffer: *const u8, size: usize) -> Self { - let (body, file) = Self::get_mappings_and_file(stream); - let mappings = - serde_json::from_str::>(&body).unwrap_or_else(|_| { - panic!("Cannot deserialize memory mappings. Received body: {body}") - }); + fn mmap_helper(len: libc::size_t, fd: libc::c_int) -> *mut libc::c_void { + // SAFETY: `mmap` is a safe function to call with valid parameters. + let ret = unsafe { + libc::mmap( + ptr::null_mut(), + len, + libc::PROT_WRITE, + libc::MAP_SHARED, + fd, + 0, + ) + }; + + assert_ne!(ret, libc::MAP_FAILED); + + ret + } + + pub fn from_mappings( + mappings: Vec, + uffd: File, + guest_memfd: Option, + userfault_bitmap_memfd: Option, + backing_buffer: *const u8, + size: usize, + ) -> Self { let memsize: usize = mappings.iter().map(|r| r.size).sum(); // Page size is the same for all memory regions, so just grab the first one let first_mapping = mappings.first().unwrap_or_else(|| { panic!( - "Cannot get the first mapping. Mappings size is {}. Received body: {body}", + "Cannot get the first mapping. 
Mappings size is {}.", mappings.len() ) }); @@ -113,14 +243,46 @@ impl UffdHandler { assert_eq!(memsize, size); assert!(page_size.is_power_of_two()); - let uffd = unsafe { Uffd::from_raw_fd(file.into_raw_fd()) }; - - Self { - mem_regions: mappings, - page_size, - backing_buffer, - uffd, - removed_pages: HashSet::new(), + let uffd = unsafe { Uffd::from_raw_fd(uffd.into_raw_fd()) }; + + match (&guest_memfd, &userfault_bitmap_memfd) { + (Some(guestmem_file), Some(bitmap_file)) => { + let guest_memfd_addr = + Some(Self::mmap_helper(size, guestmem_file.as_raw_fd()) as *mut u8); + + let bitmap_ptr = Self::mmap_helper(size, bitmap_file.as_raw_fd()) as *mut AtomicU64; + + // SAFETY: The bitmap pointer is valid and the size is correct. + let userfault_bitmap = Some(unsafe { + UserfaultBitmap::new(bitmap_ptr, memsize, NonZero::new(page_size).unwrap()) + }); + + Self { + mem_regions: mappings, + page_size, + backing_buffer, + uffd, + removed_pages: HashSet::new(), + guest_memfd, + guest_memfd_addr, + userfault_bitmap, + } + } + (None, None) => Self { + mem_regions: mappings, + page_size, + backing_buffer, + uffd, + removed_pages: HashSet::new(), + guest_memfd: None, + guest_memfd_addr: None, + userfault_bitmap: None, + }, + (_, _) => { + panic!( + "Only both guest_memfd and userfault_bitmap_memfd can be set at the same time." + ); + } } } @@ -137,6 +299,20 @@ impl UffdHandler { } } + pub fn addr_to_offset(&self, addr: *mut u8) -> u64 { + let addr = addr as u64; + for region in &self.mem_regions { + if region.contains(addr) { + return addr - region.base_host_virt_addr + region.offset as u64; + } + } + + panic!( + "Could not find addr: {:#x} within guest region mappings.", + addr + ); + } + pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool { // Find the start of the page that the current faulting address belongs to. 
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void; @@ -149,7 +325,7 @@ impl UffdHandler { } else { for region in self.mem_regions.iter() { if region.contains(fault_page_addr) { - return self.populate_from_file(region, fault_page_addr, len); + return self.populate_from_file(®ion.clone(), fault_page_addr, len); } } } @@ -160,12 +336,65 @@ impl UffdHandler { ); } - fn populate_from_file(&self, region: &GuestRegionUffdMapping, dst: u64, len: usize) -> bool { - let offset = dst - region.base_host_virt_addr; - let src = self.backing_buffer as u64 + region.offset + offset; + pub fn size(&self) -> usize { + self.mem_regions.iter().map(|r| r.size).sum() + } + + pub fn populate_via_write(&mut self, offset: usize, len: usize) -> usize { + // man 2 write: + // + // On Linux, write() (and similar system calls) will transfer at most + // 0x7ffff000 (2,147,479,552) bytes, returning the number of bytes + // actually transferred. (This is true on both 32-bit and 64-bit + // systems.) + const MAX_WRITE_LEN: usize = 2_147_479_552; + + assert!( + offset.checked_add(len).unwrap() <= self.size(), + "{} + {} >= {}", + offset, + len, + self.size() + ); + + let mut total_written = 0; + + while total_written < len { + let src = unsafe { self.backing_buffer.add(offset + total_written) }; + let len_to_write = (len - total_written).min(MAX_WRITE_LEN); + let bytes_written = unsafe { + libc::pwrite64( + self.guest_memfd.as_ref().unwrap().as_raw_fd(), + src.cast(), + len_to_write, + (offset + total_written) as libc::off64_t, + ) + }; + + let bytes_written = match bytes_written { + -1 if vmm_sys_util::errno::Error::last().errno() == libc::ENOSPC => 0, + written @ 0.. 
=> written as usize, + _ => panic!("{:?}", std::io::Error::last_os_error()), + }; + + self.userfault_bitmap + .as_mut() + .unwrap() + .reset_addr_range(offset + total_written, bytes_written); + + total_written += bytes_written; + + if bytes_written != len_to_write { + break; + } + } + + total_written + } + fn populate_via_uffdio_copy(&self, src: *const u8, dst: u64, len: usize) -> bool { unsafe { - match self.uffd.copy(src as *const _, dst as *mut _, len, true) { + match self.uffd.copy(src.cast(), dst as *mut _, len, true) { // Make sure the UFFD copied some bytes. Ok(value) => assert!(value > 0), // Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD @@ -190,6 +419,42 @@ impl UffdHandler { true } + fn populate_via_memcpy(&mut self, src: *const u8, dst: u64, offset: usize, len: usize) -> bool { + let dst_memcpy = unsafe { + self.guest_memfd_addr + .expect("no guest_memfd addr") + .add(offset) + }; + + unsafe { + std::ptr::copy_nonoverlapping(src, dst_memcpy, len); + } + + self.userfault_bitmap + .as_mut() + .unwrap() + .reset_addr_range(offset, len); + + uffd_continue(self.uffd.as_raw_fd(), dst, len as u64).expect("uffd_continue"); + + true + } + + fn populate_from_file( + &mut self, + region: &GuestRegionUffdMapping, + dst: u64, + len: usize, + ) -> bool { + let offset = (region.offset + dst - region.base_host_virt_addr) as usize; + let src = unsafe { self.backing_buffer.add(offset) }; + + match self.guest_memfd { + Some(_) => self.populate_via_memcpy(src, dst, offset, len), + None => self.populate_via_uffdio_copy(src, dst, len), + } + } + fn zero_out(&mut self, addr: u64) { let ret = unsafe { self.uffd @@ -201,6 +466,113 @@ impl UffdHandler { } } +struct UffdMsgIterator { + stream: UnixStream, + buffer: Vec, + fds_received: bool, + secret_free: bool, + current_pos: usize, + fds: [RawFd; 3], +} + +impl Iterator for UffdMsgIterator { + type Item = UffdMsgFromFirecracker; + + fn next(&mut self) -> Option { + if !self.fds_received { + let mut iov 
= []; + let mut fds = [0; 3]; + let ret = unsafe { self.stream.recv_with_fds(&mut iov, &mut fds) }; + match ret { + Ok((0, 1)) => { + self.secret_free = false; + } + Ok((0, 3)) => { + self.secret_free = true; + } + Ok((n, _)) => { + panic!("Wrong number of fds: {}", n); + } + Err(e) => { + if e.errno() == libc::EAGAIN { + return None; + } + panic!("Read error: {}", e); + } + } + self.fds_received = true; + self.fds = fds; + + match self.recv_mappings() { + Some(mappings) => Some(UffdMsgFromFirecracker::Mappings(mappings)), + None => None, // EOF or error + } + } else { + match self.recv_fault_request() { + Some(request) => Some(UffdMsgFromFirecracker::FaultReq(request)), + None => None, // EOF or error + } + } + } +} + +impl UffdMsgIterator { + fn new(stream: UnixStream) -> Self { + Self { + stream, + buffer: vec![0u8; 4096], + fds_received: false, + secret_free: false, + current_pos: 0, + fds: [0; 3], + } + } + + fn recv_json(&mut self) -> Option { + match self.stream.read(&mut self.buffer[self.current_pos..]) { + Ok(bytes_read) => { + if bytes_read > 0 { + self.current_pos += bytes_read; + } + } + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => { + // Continue with existing buffer data + } + Err(e) => panic!("Failed to read from stream: {}", e,), + } + + if self.current_pos == 0 { + return None; + } + + let str_slice = std::str::from_utf8(&self.buffer[..self.current_pos]).unwrap(); + let mut stream: StreamDeserializer<_, T> = Deserializer::from_str(str_slice).into_iter(); + + match stream.next() { + Some(Ok(value)) => { + let consumed = stream.byte_offset(); + self.buffer.copy_within(consumed..self.current_pos, 0); + self.current_pos -= consumed; + Some(value) + } + Some(Err(e)) => panic!( + "Failed to deserialize JSON message: {}. 
Error: {}", + String::from_utf8_lossy(&self.buffer[..self.current_pos]), + e + ), + None => None, + } + } + + fn recv_mappings(&mut self) -> Option> { + self.recv_json() + } + + fn recv_fault_request(&mut self) -> Option { + self.recv_json() + } +} + #[derive(Debug)] pub struct Runtime { stream: UnixStream, @@ -208,6 +580,7 @@ pub struct Runtime { backing_memory: *mut u8, backing_memory_size: usize, uffds: HashMap, + main_handler_fd: Option, } impl Runtime { @@ -238,6 +611,7 @@ impl Runtime { backing_memory: ret.cast(), backing_memory_size, uffds: HashMap::default(), + main_handler_fd: None, } } @@ -278,12 +652,22 @@ impl Runtime { })); } + pub fn send_fault_reply(&mut self, fault_reply: FaultReply) { + let reply = UffdMsgToFirecracker::FaultRep(fault_reply); + let reply_json = serde_json::to_string(&reply).unwrap(); + self.stream.write_all(reply_json.as_bytes()).unwrap(); + } + /// Polls the `UnixStream` and UFFD fds in a loop. /// When stream is polled, new uffd is retrieved. /// When uffd is polled, page fault is handled by /// calling `pf_event_dispatch` with corresponding /// uffd object passed in. - pub fn run(&mut self, pf_event_dispatch: impl Fn(&mut UffdHandler)) { + pub fn run( + &mut self, + pf_event_dispatch: impl Fn(&mut UffdHandler), + pf_vcpu_event_dispatch: impl Fn(&mut UffdHandler, usize), + ) { let mut pollfds = vec![]; // Poll the stream for incoming uffds @@ -293,6 +677,9 @@ impl Runtime { revents: 0, }); + let mut uffd_msg_iter = + UffdMsgIterator::new(self.stream.try_clone().expect("Failed to clone stream")); + // We can skip polling on stream fd if // the connection is closed. 
let mut skip_stream: usize = 0; @@ -315,22 +702,71 @@ impl Runtime { if pollfds[i].revents & libc::POLLIN != 0 { nready -= 1; if pollfds[i].fd == self.stream.as_raw_fd() { - // Handle new uffd from stream - let handler = UffdHandler::from_unix_stream( - &self.stream, - self.backing_memory, - self.backing_memory_size, - ); - pollfds.push(libc::pollfd { - fd: handler.uffd.as_raw_fd(), - events: libc::POLLIN, - revents: 0, - }); - self.uffds.insert(handler.uffd.as_raw_fd(), handler); - - // If connection is closed, we can skip the socket from being polled. - if pollfds[i].revents & (libc::POLLRDHUP | libc::POLLHUP) != 0 { - skip_stream = 1; + while let Some(msg) = uffd_msg_iter.next() { + match msg { + UffdMsgFromFirecracker::Mappings(mappings) => { + let (guest_memfd, userfault_bitmap_memfd) = + if uffd_msg_iter.secret_free { + ( + Some(unsafe { + File::from_raw_fd(uffd_msg_iter.fds[1]) + }), + Some(unsafe { + File::from_raw_fd(uffd_msg_iter.fds[2]) + }), + ) + } else { + (None, None) + }; + + let uffd_handler = UffdHandler::from_mappings( + mappings, + unsafe { File::from_raw_fd(uffd_msg_iter.fds[0]) }, + guest_memfd, + userfault_bitmap_memfd, + self.backing_memory, + self.backing_memory_size, + ); + + let fd = uffd_handler.uffd.as_raw_fd(); + if uffd_handler.guest_memfd.is_some() { + self.main_handler_fd = Some(fd); + } + + pollfds.push(libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }); + self.uffds.insert(fd, uffd_handler); + } + UffdMsgFromFirecracker::FaultReq(fault_request) => { + let fd = self.main_handler_fd.unwrap(); + + let mut locked_uffd = self.uffds.get_mut(&fd).unwrap(); + let page_size = locked_uffd.page_size; + + assert!( + (fault_request.offset as usize) < locked_uffd.size(), + "received bogus offset from firecracker" + ); + + // Handle one of FaultRequest page faults + pf_vcpu_event_dispatch( + locked_uffd.deref_mut(), + fault_request.offset as usize, + ); + + self.send_fault_reply( + fault_request.into_reply(page_size as u64), + ); + 
} + } + // If connection is closed, we can skip the socket from + // being polled. + if pollfds[i].revents & (libc::POLLRDHUP | libc::POLLHUP) != 0 { + skip_stream = 1; + } } } else { // Handle one of uffd page faults @@ -376,7 +812,7 @@ mod tests { let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); // Update runtime with actual runtime let runtime = uninit_runtime.write(Runtime::new(stream, file)); - runtime.run(|_: &mut UffdHandler| {}); + runtime.run(|_: &mut UffdHandler| {}, |_: &mut UffdHandler, _: usize| {}); }); // wait for runtime thread to initialize itself @@ -385,6 +821,7 @@ mod tests { let stream = UnixStream::connect(dummy_socket_path_clone).expect("Cannot connect to the socket"); + #[allow(deprecated)] let dummy_memory_region = vec![GuestRegionUffdMapping { base_host_virt_addr: 0, size: 0x1000, @@ -393,6 +830,8 @@ mod tests { }]; let dummy_memory_region_json = serde_json::to_string(&dummy_memory_region).unwrap(); + // Send the first mapping message to the runtime. + // We expect the runtime to store its UFFD. let dummy_file_1 = TempFile::new().unwrap(); let dummy_fd_1 = dummy_file_1.as_file().as_raw_fd(); stream @@ -404,6 +843,8 @@ mod tests { assert_eq!((*runtime_ptr).uffds.len(), 1); } + // Send the second mappings message to the runtime. + // The runtime is only accepting one mappings message. 
let dummy_file_2 = TempFile::new().unwrap(); let dummy_fd_2 = dummy_file_2.as_file().as_raw_fd(); stream @@ -412,12 +853,13 @@ mod tests { // wait for the runtime thread to process message std::thread::sleep(std::time::Duration::from_millis(100)); unsafe { - assert_eq!((*runtime_ptr).uffds.len(), 2); + assert_eq!((*runtime_ptr).uffds.len(), 1); } // there is no way to properly stop runtime, so // we send a message with an incorrect memory region // to cause runtime thread to panic + #[allow(deprecated)] let error_memory_region = vec![GuestRegionUffdMapping { base_host_virt_addr: 0, size: 0, diff --git a/src/firecracker/examples/uffd/uffd_utils/userfault_bitmap.rs b/src/firecracker/examples/uffd/uffd_utils/userfault_bitmap.rs new file mode 100644 index 00000000000..7a751fa0ef2 --- /dev/null +++ b/src/firecracker/examples/uffd/uffd_utils/userfault_bitmap.rs @@ -0,0 +1,203 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::num::NonZeroUsize; +use std::sync::atomic::{AtomicU64, Ordering}; + +/// `UserfaultBitmap` implements a simple bit map on the page level with test and set operations. +/// It is page-size aware, so it converts addresses to page numbers before setting or clearing +/// the bits. +#[derive(Debug)] +pub struct UserfaultBitmap { + map: *mut AtomicU64, + size: usize, + byte_size: usize, + page_size: NonZeroUsize, + map_size: usize, +} + +impl UserfaultBitmap { + /// Create a new bitmap using a user-supplied pointer. 
+ /// + /// # Safety + /// + /// Caller must ensure: + /// * `map_ptr` points to a valid region of memory containing initialized `AtomicU64` elements + /// * `map_ptr` is properly aligned for `AtomicU64` + /// * The memory region contains enough space for `ceil(ceil(byte_size/page_size)/64)` elements + /// * The memory region pointed to by `map_ptr` must not be accessed through any other means + /// while this `UserfaultBitmap` exists + /// * The caller must ensure the memory remains valid for the lifetime of the returned + /// `UserfaultBitmap` + pub unsafe fn new(map_ptr: *mut AtomicU64, byte_size: usize, page_size: NonZeroUsize) -> Self { + let num_pages = byte_size.div_ceil(page_size.get()); + let map_size = num_pages.div_ceil(u64::BITS as usize); + + UserfaultBitmap { + map: map_ptr, + size: num_pages, + byte_size, + page_size, + map_size, + } + } + + /// Is bit `n` set? Bits outside the range of the bitmap are always unset. + pub fn is_bit_set(&self, index: usize) -> bool { + if index < self.size { + unsafe { + let map_entry = &*self.map.add(index >> 6); + (map_entry.load(Ordering::Acquire) & (1 << (index & 63))) != 0 + } + } else { + // Out-of-range bits are always unset. + false + } + } + + /// Reset a range of `len` bytes starting at `start_addr`. The first bit set in the bitmap + /// is for the page corresponding to `start_addr`, and the last bit that we set corresponds + /// to address `start_addr + len - 1`. 
+ pub fn reset_addr_range(&self, start_addr: usize, len: usize) { + if len == 0 { + return; + } + + let first_bit = start_addr / self.page_size; + let last_bit = start_addr.saturating_add(len - 1) / self.page_size; + + for n in first_bit..=last_bit { + if n >= self.size { + break; + } + unsafe { + let map_entry = &*self.map.add(n >> 6); + map_entry.fetch_and(!(1 << (n & 63)), Ordering::SeqCst); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicU64; + + use super::*; + + // Helper function to create a test bitmap + fn setup_test_bitmap( + byte_size: usize, + page_size: NonZeroUsize, + ) -> (Vec, UserfaultBitmap) { + let num_pages = byte_size.div_ceil(page_size.get()); + let map_size = num_pages.div_ceil(u64::BITS as usize); + let mut memory = Vec::with_capacity(map_size); + for _ in 0..map_size { + memory.push(AtomicU64::new(0)); + } + let ptr = memory.as_mut_ptr(); + let bitmap = unsafe { UserfaultBitmap::new(ptr, byte_size, page_size) }; + (memory, bitmap) + } + + #[test] + fn test_basic_initialization() { + let page_size = NonZeroUsize::new(128).unwrap(); + let (_memory, bitmap) = setup_test_bitmap(1024, page_size); + + assert!(!bitmap.is_bit_set(0)); + assert!(!bitmap.is_bit_set(7)); + } + + #[test] + fn test_out_of_bounds_access() { + let page_size = NonZeroUsize::new(128).unwrap(); + let (_memory, bitmap) = setup_test_bitmap(1024, page_size); + + // With 1024 bytes and 128-byte pages, we should have 8 pages + assert!(!bitmap.is_bit_set(8)); // This should be out of bounds + assert!(!bitmap.is_bit_set(100)); // This should be out of bounds + } + + #[test] + fn test_reset_addr_range() { + let page_size = NonZeroUsize::new(128).unwrap(); + let (memory, bitmap) = setup_test_bitmap(1024, page_size); + + // Set bits 0 and 1 (representing first two pages) + memory[0].store(0b11, Ordering::SeqCst); + + // Verify bits are set + assert!(bitmap.is_bit_set(0)); + assert!(bitmap.is_bit_set(1)); + assert!(!bitmap.is_bit_set(2)); + + // Reset first 
page + bitmap.reset_addr_range(0, 128); + + // Verify first bit is reset but second remains set + assert!(!bitmap.is_bit_set(0)); + assert!(bitmap.is_bit_set(1)); + } + + #[test] + fn test_reset_addr_range_spanning_multiple_words() { + let page_size = NonZeroUsize::new(128).unwrap(); + // Ensure we allocate enough space for at least 2 words (128 bits) + let (memory, bitmap) = setup_test_bitmap(128 * 128, page_size); // 128 pages + + // Set bits in different words + memory[0].store(u64::MAX, Ordering::SeqCst); + memory[1].store(u64::MAX, Ordering::SeqCst); + + // Reset a range spanning both words + bitmap.reset_addr_range(63 * 128, 256); // Reset bits 63 and 64 + + // Check bits are reset + assert!(!bitmap.is_bit_set(63)); + assert!(!bitmap.is_bit_set(64)); + // Check adjacent bits are still set + assert!(bitmap.is_bit_set(62)); + assert!(bitmap.is_bit_set(65)); + } + + #[test] + fn test_reset_addr_range_zero_length() { + let page_size = NonZeroUsize::new(128).unwrap(); + let (memory, bitmap) = setup_test_bitmap(1024, page_size); + + // Set a bit manually + memory[0].store(1, Ordering::SeqCst); + + // Reset with length 0 + bitmap.reset_addr_range(0, 0); + + // Bit should still be set + assert!(bitmap.is_bit_set(0)); + } + + #[test] + fn test_reset_addr_range_beyond_bounds() { + let page_size = NonZeroUsize::new(128).unwrap(); + let (_memory, bitmap) = setup_test_bitmap(1024, page_size); + + // This should not panic + bitmap.reset_addr_range(1024, 2048); + } + + #[test] + fn test_edge_cases() { + // Test with minimum page size + let page_size = NonZeroUsize::new(1).unwrap(); + let (_memory, bitmap) = setup_test_bitmap(64, page_size); + assert!(!bitmap.is_bit_set(0)); + + // Test with zero byte_size + let page_size = NonZeroUsize::new(128).unwrap(); + let (_memory, bitmap) = setup_test_bitmap(0, page_size); + assert!(!bitmap.is_bit_set(0)); + + // Test reset_addr_range with maximum usize value + bitmap.reset_addr_range(usize::MAX - 128, 256); + } +} diff --git 
a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index fbeedefa348..e6b0343e3c7 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -4,7 +4,7 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. use std::fmt::Debug; -use std::io; +use std::io::{self, Write}; use std::os::fd::AsFd; use std::os::unix::fs::MetadataExt; #[cfg(feature = "gdb")] @@ -14,7 +14,6 @@ use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; -use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; @@ -23,7 +22,7 @@ use vm_superio::Rtc; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; -use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; +use crate::arch::{ConfigurationError, configure_system_for_boot, host_page_size, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -54,15 +53,19 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::gdb; use crate::initrd::{InitrdConfig, InitrdError}; use crate::logger::{debug, error}; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{ + GuestMemoryFromFileError, GuestMemoryFromUffdError, MicrovmState, MicrovmStateError, + guest_memory_from_file, guest_memory_from_uffd, +}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; +use crate::vmm_config::snapshot::{LoadSnapshotParams, MemBackendType}; use crate::vstate::kvm::Kvm; -use crate::vstate::memory::{GuestRegionMmap, MaybeBounce}; +use crate::vstate::memory::{MaybeBounce, create_memfd}; use crate::vstate::vcpu::{Vcpu, VcpuError}; use 
crate::vstate::vm::{KVM_GMEM_NO_DIRECT_MAP, Vm}; use crate::{EventManager, Vmm, VmmError, device_manager}; @@ -159,7 +162,7 @@ fn create_vmm_and_vcpus( // Instantiate ACPI device manager. let acpi_device_manager = ACPIDeviceManager::new(); - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; + let (vcpus, vcpus_exit_evt, userfault_channels) = vm.create_vcpus(vcpu_count, secret_free)?; #[cfg(target_arch = "x86_64")] let pio_device_manager = { @@ -188,6 +191,8 @@ fn create_vmm_and_vcpus( kvm, vm, uffd: None, + uffd_socket: None, + userfault_channels, vcpus_handles: Vec::new(), vcpus_exit_evt, resource_allocator, @@ -260,7 +265,7 @@ pub fn build_microvm_for_boot( .map_err(StartMicrovmError::GuestMemory)?; vmm.vm - .register_memory_regions(guest_memory) + .register_memory_regions(guest_memory, None) .map_err(VmmError::Vm)?; #[cfg(target_arch = "x86_64")] @@ -422,6 +427,17 @@ pub fn build_and_boot_microvm( Ok(vmm) } +/// Sub-Error type for [`build_microvm_from_snapshot`] to contain either +/// [`GuestMemoryFromFileError`] or [`GuestMemoryFromUffdError`] within +/// [`BuildMicrovmFromSnapshotError`]. +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum BuildMicrovmFromSnapshotErrorGuestMemoryError { + /// Error creating guest memory from file: {0} + File(#[from] GuestMemoryFromFileError), + /// Error creating guest memory from uffd: {0} + Uffd(#[from] GuestMemoryFromUffdError), +} + /// Error type for [`build_microvm_from_snapshot`]. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum BuildMicrovmFromSnapshotError { @@ -459,6 +475,12 @@ pub enum BuildMicrovmFromSnapshotError { ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError), /// VMGenID update failed: {0} VMGenIDUpdate(std::io::Error), + /// Internal error while restoring microVM: {0} + Internal(#[from] VmmError), + /// Failed to load guest memory: {0} + GuestMemory(#[from] BuildMicrovmFromSnapshotErrorGuestMemoryError), + /// Userfault bitmap memfd error: {0} + UserfaultBitmapMemfd(#[from] crate::vstate::memory::MemoryError), } /// Builds and starts a microVM based on the provided MicrovmState. @@ -470,27 +492,96 @@ pub fn build_microvm_from_snapshot( instance_info: &InstanceInfo, event_manager: &mut EventManager, microvm_state: MicrovmState, - guest_memory: Vec, - uffd: Option, seccomp_filters: &BpfThreadMap, + params: &LoadSnapshotParams, vm_resources: &mut VmResources, ) -> Result>, BuildMicrovmFromSnapshotError> { + // TODO: take it from kvm-bindings when userfault support is merged upstream + const KVM_CAP_USERFAULT: u32 = 241; + // Build Vmm. 
debug!("event_start: build microvm from snapshot"); + + let secret_free = vm_resources.machine_config.secret_free; + + let mut kvm_capabilities = microvm_state.kvm_state.kvm_cap_modifiers.clone(); + if secret_free { + kvm_capabilities.push(KvmCapability::Add(KVM_CAP_USERFAULT)); + } + let (mut vmm, mut vcpus) = create_vmm_and_vcpus( instance_info, event_manager, vm_resources.machine_config.vcpu_count, - microvm_state.kvm_state.kvm_cap_modifiers.clone(), - false, + kvm_capabilities, + secret_free, ) .map_err(StartMicrovmError::Internal)?; + let guest_memfd = match secret_free { + true => Some( + vmm.vm + .create_guest_memfd(vm_resources.memory_size(), KVM_GMEM_NO_DIRECT_MAP) + .map_err(VmmError::Vm)?, + ), + false => None, + }; + + let userfault_bitmap_memfd = if secret_free { + let bitmap_size = vm_resources.memory_size() / host_page_size(); + let bitmap_file = create_memfd(bitmap_size as u64, None)?; + + // Set all bits so a fault on any page will cause a VM exit + let all_set = vec![0xffu8; bitmap_size]; + bitmap_file + .as_file() + .write_all(&all_set) + .expect("failed to write"); + + Some(bitmap_file.into_file()) + } else { + None + }; + + let mem_backend_path = ¶ms.mem_backend.backend_path; + let mem_state = µvm_state.vm_state.memory; + let track_dirty_pages = params.enable_diff_snapshots; + + let (guest_memory, uffd, socket) = match params.mem_backend.backend_type { + MemBackendType::File => { + if vm_resources.machine_config.huge_pages.is_hugetlbfs() { + return Err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File( + GuestMemoryFromFileError::HugetlbfsSnapshot, + ) + .into()); + } + ( + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) + .map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::File)?, + None, + None, + ) + } + MemBackendType::Uffd => guest_memory_from_uffd( + mem_backend_path, + mem_state, + track_dirty_pages, + vm_resources.machine_config.huge_pages, + guest_memfd, + userfault_bitmap_memfd.as_ref(), + ) + 
.map_err(BuildMicrovmFromSnapshotErrorGuestMemoryError::Uffd)?, + }; + vmm.vm - .register_memory_regions(guest_memory) + .register_memory_regions(guest_memory, userfault_bitmap_memfd.as_ref()) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; vmm.uffd = uffd; + vmm.uffd_socket = socket; + + #[cfg(target_arch = "x86_64")] + vmm.vm.set_memory_private().map_err(VmmError::Vm)?; #[cfg(target_arch = "x86_64")] { @@ -947,7 +1038,7 @@ pub(crate) mod tests { ) .unwrap(); - let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap(); + let (_, vcpus_exit_evt, _) = vm.create_vcpus(1, false).unwrap(); Vmm { events_observer: Some(std::io::stdin()), @@ -956,6 +1047,8 @@ pub(crate) mod tests { kvm, vm, uffd: None, + uffd_socket: None, + userfault_channels: None, vcpus_handles: Vec::new(), vcpus_exit_evt, resource_allocator: ResourceAllocator::new().unwrap(), diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index f1b8cf01697..7dfb9fc3c31 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -660,7 +660,7 @@ mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm, false).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_memory_regions(guest_mem, None).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -691,7 +691,7 @@ mod tests { let guest_mem = multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm, false).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_memory_regions(guest_mem, None).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -747,7 +747,7 @@ mod tests { let guest_mem = 
multi_region_mem_raw(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); let mut vm = Vm::new(&kvm, false).unwrap(); - vm.register_memory_regions(guest_mem).unwrap(); + vm.register_memory_regions(guest_mem, None).unwrap(); #[cfg(target_arch = "x86_64")] vm.setup_irqchip().unwrap(); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 29f3b0148ac..a1ddfd9d43b 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -115,8 +115,10 @@ pub mod vstate; pub mod initrd; use std::collections::HashMap; -use std::io; +use std::io::{self, Read, Write}; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; +use std::os::unix::net::UnixStream; use std::sync::mpsc::RecvTimeoutError; use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; @@ -127,6 +129,7 @@ use devices::acpi::vmgenid::VmGenIdError; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; use userfaultfd::Uffd; +use vm_memory::GuestAddress; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::terminal::Terminal; @@ -146,14 +149,17 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::net::Net; use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET}; use crate::logger::{METRICS, MetricsError, error, info, warn}; -use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; +use crate::persist::{FaultReply, FaultRequest, MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; -use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; +use crate::vstate::memory::{ + GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, +}; use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use 
crate::vstate::vm::Vm; +use crate::vstate::vm::{UserfaultChannel, UserfaultChannelError, UserfaultData}; /// Shorthand type for the EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -310,6 +316,10 @@ pub struct Vmm { pub vm: Vm, // Save UFFD in order to keep it open in the Firecracker process, as well. uffd: Option, + // Used for userfault communication with the UFFD handler when secret freedom is enabled + uffd_socket: Option, + // Used for userfault communication with vCPUs when secret freedom is enabled + userfault_channels: Option>, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, @@ -797,6 +807,169 @@ impl Vmm { self.shutdown_exit_code = Some(exit_code); } + fn active_event_in_userfault_channel( + &self, + source: RawFd, + event_set: EventSet, + ) -> Option { + if let Some(userfault_channels) = &self.userfault_channels { + userfault_channels.iter().position(|channel| { + let receiver = &channel.receiver; + source == receiver.as_raw_fd() && event_set == EventSet::IN + }) + } else { + None + } + } + + fn process_userfault_channels(&mut self, vcpu: usize) { + loop { + match self + .userfault_channels + .as_mut() + .expect("Userfault channels must be set")[vcpu] + .recv() + { + Ok(userfault_data) => { + let offset = self + .vm + .guest_memory() + .gpa_to_offset(GuestAddress(userfault_data.gpa)) + .expect("Failed to convert GPA to offset"); + + let fault_request = FaultRequest { + vcpu: vcpu.try_into().expect("Invalid vCPU index"), + offset, + flags: userfault_data.flags, + token: None, + }; + let fault_request_json = serde_json::to_string(&fault_request) + .expect("Failed to serialize fault request"); + + let written = self + .uffd_socket + .as_ref() + .expect("Uffd socket is not set") + .write(fault_request_json.as_bytes()) + .expect("Failed to write to uffd socket"); + + if written != fault_request_json.len() { + panic!( + "Failed to write the 
entire fault request to the uffd socket: \ + expected {}, written {}", + fault_request_json.len(), + written + ); + } + } + Err(ref e) => match e { + UserfaultChannelError::IO(io_e) if io_e.kind() == io::ErrorKind::WouldBlock => { + break; + } + _ => panic!("Error receiving userfault data: {}", e), + }, + } + } + } + + fn active_event_in_uffd_socket(&self, source: RawFd, event_set: EventSet) -> bool { + if let Some(uffd_socket) = &self.uffd_socket { + uffd_socket.as_raw_fd() == source && event_set == EventSet::IN + } else { + false + } + } + + fn process_uffd_socket(&mut self) { + const BUFFER_SIZE: usize = 4096; + + let stream = self.uffd_socket.as_mut().expect("Uffd socket is not set"); + + let mut buffer = [0u8; BUFFER_SIZE]; + let mut current_pos = 0; + let mut exit_loop = false; + + loop { + if current_pos < BUFFER_SIZE { + match stream.read(&mut buffer[current_pos..]) { + Ok(0) => break, + Ok(n) => current_pos += n, + Err(e) if e.kind() == io::ErrorKind::WouldBlock => { + if exit_loop { + break; + } + } + Err(e) => panic!("Read error: {}", e), + } + + exit_loop = false; + } + + let mut parser = serde_json::Deserializer::from_slice(&buffer[..current_pos]) + .into_iter::(); + let mut total_consumed = 0; + let mut needs_more = false; + + while let Some(result) = parser.next() { + match result { + Ok(fault_reply) => { + let gpa = self + .vm + .common + .guest_memory + .offset_to_gpa(fault_reply.offset) + .expect("Failed to convert offset to GPA"); + + let userfaultfd_data = UserfaultData { + flags: fault_reply.flags, + gpa: gpa.0, + size: fault_reply.len, + }; + + let vcpu = fault_reply.vcpu.expect("vCPU must be set"); + + self.userfault_channels + .as_mut() + .expect("userfault_channels are not set") + .get_mut(vcpu as usize) + .expect("Invalid vcpu index") + .send(userfaultfd_data) + .expect("Failed to send userfault data"); + + total_consumed = parser.byte_offset(); + } + Err(e) if e.is_eof() => { + needs_more = true; + break; + } + Err(e) => { + println!( + 
"Buffer content: {:?}", + std::str::from_utf8(&buffer[..current_pos]) + ); + panic!("Invalid JSON: {}", e); + } + } + } + + if total_consumed > 0 { + buffer.copy_within(total_consumed..current_pos, 0); + current_pos -= total_consumed; + } + + if needs_more { + continue; + } + + // We consumed all data in the buffer, but the socket may have remaining unread data so + // we attempt to read from it and exit the loop only if we confirm that nothing is in + // there. + if current_pos == 0 { + exit_loop = true; + } + } + } + /// Gets a reference to kvm-ioctls Vm #[cfg(feature = "gdb")] pub fn vm(&self) -> &Vm { @@ -903,8 +1076,14 @@ impl MutEventSubscriber for Vmm { FcExitCode::Ok }; self.stop(exit_code); - } else { - error!("Spurious EventManager event for handler: Vmm"); + } + + if let Some(vcpu) = self.active_event_in_userfault_channel(source, event_set) { + self.process_userfault_channels(vcpu); + } + + if self.active_event_in_uffd_socket(source, event_set) { + self.process_uffd_socket(); } } @@ -912,5 +1091,19 @@ impl MutEventSubscriber for Vmm { if let Err(err) = ops.add(Events::new(&self.vcpus_exit_evt, EventSet::IN)) { error!("Failed to register vmm exit event: {}", err); } + + if let Some(uffd_socket) = self.uffd_socket.as_ref() { + if let Err(err) = ops.add(Events::new(uffd_socket, EventSet::IN)) { + panic!("Failed to register UFFD socket: {}", err); + } + } + + if let Some(userfault_channels) = self.userfault_channels.as_ref() { + for channel in userfault_channels { + if let Err(err) = ops.add(Events::new(&channel.receiver, EventSet::IN)) { + panic!("Failed to register userfault events: {}", err); + } + } + } } } diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 967f9804c80..3c2d62316d0 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -7,6 +7,7 @@ use std::fmt::Debug; use std::fs::{File, OpenOptions}; use std::io::{self, Write}; use std::mem::forget; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use 
std::os::unix::net::UnixStream; use std::path::Path; @@ -115,6 +116,54 @@ pub struct GuestRegionUffdMapping { pub page_size_kib: usize, } +/// FaultRequest +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct FaultRequest { + /// vCPU that encountered the fault (not meaningful to Fission) + pub vcpu: u32, + /// Offset in guest_memfd where the fault occurred + pub offset: u64, + /// Flags (not meaningful to Fission) + pub flags: u64, + /// Async PF token (not meaningful to Fission) + pub token: Option, +} + +/// FaultReply +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct FaultReply { + /// vCPU that encountered the fault, from `FaultRequest` (if present, otherwise 0) + pub vcpu: Option, + /// Offset in guest_memfd where population started + pub offset: u64, + /// Length of populated area + pub len: u64, + /// Flags, must be copied from `FaultRequest`, otherwise 0 + pub flags: u64, + /// Async PF token, must be copied from `FaultRequest`, otherwise None + pub token: Option, + /// Whether the populated pages are zero pages + pub zero: bool, +} + +/// UffdMsgFromFirecracker +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +pub enum UffdMsgFromFirecracker { + /// Mappings + Mappings(Vec), + /// FaultReq + FaultReq(FaultRequest), +} + +/// UffdMsgToFirecracker +#[derive(Serialize, Deserialize, Debug)] +#[serde(untagged)] +pub enum UffdMsgToFirecracker { + /// FaultRep + FaultRep(FaultReply), +} + /// Errors related to saving and restoring Microvm state. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum MicrovmStateError { @@ -336,6 +385,17 @@ pub fn restore_from_snapshot( vm_resources: &mut VmResources, ) -> Result>, RestoreFromSnapshotError> { let mut microvm_state = snapshot_state_from_file(¶ms.snapshot_path)?; + + if microvm_state.vm_info.secret_free && params.mem_backend.backend_type == MemBackendType::File + { + return Err(RestoreFromSnapshotError::Build( + BuildMicrovmFromSnapshotError::VmUpdateConfig(MachineConfigError::Incompatible( + "secret freedom", + "file memory backend", + )), + )); + } + for entry in ¶ms.network_overrides { let net_devices = &mut microvm_state.device_states.net_devices; if let Some(device) = net_devices @@ -376,38 +436,12 @@ pub fn restore_from_snapshot( // Some sanity checks before building the microvm. snapshot_state_sanity_check(µvm_state)?; - let mem_backend_path = ¶ms.mem_backend.backend_path; - let mem_state = µvm_state.vm_state.memory; - - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => { - if vm_resources.machine_config.huge_pages.is_hugetlbfs() { - return Err(RestoreFromSnapshotGuestMemoryError::File( - GuestMemoryFromFileError::HugetlbfsSnapshot, - ) - .into()); - } - ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) - .map_err(RestoreFromSnapshotGuestMemoryError::File)?, - None, - ) - } - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - vm_resources.machine_config.huge_pages, - ) - .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?, - }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, - guest_memory, - uffd, seccomp_filters, + params, vm_resources, ) .map_err(RestoreFromSnapshotError::Build) @@ -451,7 +485,8 @@ pub enum GuestMemoryFromFileError { HugetlbfsSnapshot, } -fn guest_memory_from_file( +/// Creates guest memory from a file. 
+pub fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, @@ -476,14 +511,73 @@ pub enum GuestMemoryFromUffdError { Send(#[from] vmm_sys_util::errno::Error), } -fn guest_memory_from_uffd( +// TODO remove these when the UFFD crate supports minor faults for guest_memfd +const UFFDIO_REGISTER_MODE_MINOR: u64 = 1 << 2; +const UFFDIO_REGISTER: u64 = 0xc020aa00; + +use std::ffi::c_void; + +use libc::ioctl; +#[repr(C)] +struct UffdRegister { + range: UffdRange, + mode: u64, + ioctls: u64, +} + +#[repr(C)] +struct UffdRange { + start: u64, + len: u64, +} + +fn guest_memory_uffd_register_minor(guest_memory: &[GuestRegionMmap], uffd: &Uffd) { + for mem_region in guest_memory.iter() { + let uffd_register = UffdRegister { + range: UffdRange { + start: mem_region.as_ptr() as u64, + len: mem_region.size() as u64, + }, + mode: UFFDIO_REGISTER_MODE_MINOR, + ioctls: 0, + }; + + // SAFETY: ioctl is safe. Called with a valid UFFD object and a valid uffd_register + // structure. + #[allow(clippy::cast_possible_truncation)] + #[allow(clippy::ptr_as_ptr)] + let result = unsafe { + ioctl( + uffd.as_raw_fd(), + UFFDIO_REGISTER as _, + &uffd_register as *const _ as *const c_void, + ) + }; + + if result == -1 { + panic!( + "Failed to register UFFD minor: {}", + io::Error::last_os_error() + ); + } + } +} + +type GuestMemoryResult = + Result<(Vec, Option, Option), GuestMemoryFromUffdError>; + +/// Creates guest memory using a UDS socket provided by a UFFD handler. 
+pub fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, huge_pages: HugePageConfig, -) -> Result<(Vec, Option), GuestMemoryFromUffdError> { + guest_memfd: Option, + userfault_bitmap_memfd: Option<&File>, +) -> GuestMemoryResult { + let guest_memfd_copy = guest_memfd.as_ref().map(|f| f.try_clone()).transpose()?; let (guest_memory, backend_mappings) = - create_guest_memory(mem_state, track_dirty_pages, huge_pages)?; + create_guest_memory(mem_state, track_dirty_pages, huge_pages, guest_memfd)?; let mut uffd_builder = UffdBuilder::new(); @@ -500,22 +594,48 @@ fn guest_memory_from_uffd( .create() .map_err(GuestMemoryFromUffdError::Create)?; - for mem_region in guest_memory.iter() { - uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) - .map_err(GuestMemoryFromUffdError::Register)?; - } + let fds = match guest_memfd_copy { + Some(ref guest_memfd) => { + guest_memory_uffd_register_minor(&guest_memory, &uffd); + vec![ + uffd.as_raw_fd(), + guest_memfd.as_raw_fd(), + userfault_bitmap_memfd + .expect("memfd is not present") + .as_raw_fd(), + ] + } + None => { + for mem_region in guest_memory.iter() { + uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) + .map_err(GuestMemoryFromUffdError::Register)?; + } + vec![uffd.as_raw_fd()] + } + }; + + // We prevent Rust from closing the guest_memfd file descriptor + // so the UFFD handler can use it to populate guest memory. 
+ forget(guest_memfd_copy); - send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; + let socket = send_uffd_handshake(mem_uds_path, &backend_mappings, fds)?; - Ok((guest_memory, Some(uffd))) + Ok((guest_memory, Some(uffd), Some(socket))) } fn create_guest_memory( mem_state: &GuestMemoryState, track_dirty_pages: bool, huge_pages: HugePageConfig, + guest_memfd: Option, ) -> Result<(Vec, Vec), GuestMemoryFromUffdError> { - let guest_memory = memory::anonymous(mem_state.regions(), track_dirty_pages, huge_pages)?; + let guest_memory = match guest_memfd { + Some(file) => { + memory::file_shared(file, mem_state.regions(), track_dirty_pages, huge_pages)? + } + None => memory::anonymous(mem_state.regions(), track_dirty_pages, huge_pages)?, + }; + let mut backend_mappings = Vec::with_capacity(guest_memory.len()); let mut offset = 0; for mem_region in guest_memory.iter() { @@ -536,15 +656,19 @@ fn create_guest_memory( fn send_uffd_handshake( mem_uds_path: &Path, backend_mappings: &[GuestRegionUffdMapping], - uffd: &impl AsRawFd, -) -> Result<(), GuestMemoryFromUffdError> { + fds: Vec, +) -> Result { // This is safe to unwrap() because we control the contents of the vector // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(backend_mappings).unwrap(); let socket = UnixStream::connect(mem_uds_path)?; - socket.send_with_fd( - backend_mappings.as_bytes(), + socket + .set_nonblocking(true) + .expect("Cannot set non-blocking"); + + socket.send_with_fds( + &[backend_mappings.as_bytes()], // In the happy case we can close the fd since the other process has it open and is // using it to serve us pages. // @@ -575,15 +699,10 @@ fn send_uffd_handshake( // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the // page fault handler process does not tear down Firecracker when necessary, the // uffd will still be alive but with no one to serve faults, leading to guest freeze. 
- uffd.as_raw_fd(), + &fds, )?; - // We prevent Rust from closing the socket file descriptor to avoid a potential race condition - // between the mappings message and the connection shutdown. If the latter arrives at the UFFD - // handler first, the handler never sees the mappings. - forget(socket); - - Ok(()) + Ok(socket) } #[cfg(test)] @@ -714,7 +833,7 @@ mod tests { }; let (_, uffd_regions) = - create_guest_memory(&mem_state, false, HugePageConfig::None).unwrap(); + create_guest_memory(&mem_state, false, HugePageConfig::None, None).unwrap(); assert_eq!(uffd_regions.len(), 1); assert_eq!(uffd_regions[0].size, 0x20000); @@ -748,7 +867,7 @@ mod tests { let listener = UnixListener::bind(uds_path).expect("Cannot bind to socket path"); - send_uffd_handshake(uds_path, &uffd_regions, &std::io::stdin()).unwrap(); + send_uffd_handshake(uds_path, &uffd_regions, vec![std::io::stdin().as_raw_fd()]).unwrap(); let (stream, _) = listener.accept().expect("Cannot listen on UDS socket"); diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 005b4f7d38c..0f319562683 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -323,6 +323,12 @@ where /// Store the dirty bitmap in internal store fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize); + + /// Convert guest physical address to file offset + fn gpa_to_offset(&self, gpa: GuestAddress) -> Option; + + /// Convert file offset to guest physical address + fn offset_to_gpa(&self, offset: u64) -> Option; } /// State of a guest memory region saved to file/buffer. 
@@ -473,6 +479,33 @@ impl GuestMemoryExtension for GuestMemoryMmap { } }); } + + /// Convert guest physical address to file offset + fn gpa_to_offset(&self, gpa: GuestAddress) -> Option { + self.find_region(gpa).map(|r| { + gpa.0 - r.start_addr().0 + r.file_offset().expect("File offset is None").start() + }) + } + + /// Convert file offset to guest physical address + fn offset_to_gpa(&self, offset: u64) -> Option { + self.iter().find_map(|region| { + if let Some(reg_offset) = region.file_offset() { + let region_start = reg_offset.start(); + let region_size = region.size(); + + if offset >= region_start && offset < region_start + region_size as u64 { + Some(GuestAddress( + region.start_addr().0 + (offset - region_start), + )) + } else { + None + } + } else { + None + } + }) + } } /// Creates a memfd of the given size and huge pages configuration diff --git a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index 825af33eea4..843d431fe91 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -31,11 +31,15 @@ use crate::logger::{IncMetric, METRICS}; use crate::seccomp::{BpfProgram, BpfProgramRef}; use crate::utils::signal::{Killable, register_signal_handler, sigrtmin}; use crate::utils::sm::StateMachine; -use crate::vstate::vm::Vm; +use crate::vstate::vm::{UserfaultChannel, UserfaultData, Vm}; /// Signal number (SIGRTMIN) used to kick Vcpus. pub const VCPU_RTSIG_OFFSET: i32 = 0; +// TODO: remove when KVM userfault support is merged upstream. +/// VM exit due to a userfault. +const KVM_MEMORY_EXIT_FLAG_USERFAULT: u64 = 1 << 4; + /// Errors associated with the wrappers over KVM ioctls. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum VcpuError { @@ -109,6 +113,8 @@ pub struct Vcpu { response_receiver: Option>, /// The transmitting end of the responses channel owned by the vcpu side. 
response_sender: Sender, + /// Channel for communicating userfaults with the VMM thread + userfault_channel: Option, } impl Vcpu { @@ -201,7 +207,13 @@ impl Vcpu { /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. /// * `exit_evt` - An `EventFd` that will be written into when this vcpu exits. - pub fn new(index: u8, vm: &Vm, exit_evt: EventFd) -> Result { + /// * `userfault_channel` - An optional userfault channel for handling page faults. + pub fn new( + index: u8, + vm: &Vm, + exit_evt: EventFd, + userfault_channel: Option, + ) -> Result { let (event_sender, event_receiver) = channel(); let (response_sender, response_receiver) = channel(); let kvm_vcpu = KvmVcpu::new(index, vm).unwrap(); @@ -215,6 +227,7 @@ impl Vcpu { #[cfg(feature = "gdb")] gdb_event: None, kvm_vcpu, + userfault_channel, }) } @@ -303,6 +316,7 @@ impl Vcpu { // - the other vCPUs won't ever exit out of `KVM_RUN`, but they won't consume CPU. // So we pause vCPU0 and send a signal to the emulation thread to stop the VMM. Ok(VcpuEmulation::Stopped) => return self.exit(FcExitCode::Ok), + Ok(VcpuEmulation::Userfault(_)) => unreachable!(), // If the emulation requests a pause lets do this #[cfg(feature = "gdb")] Ok(VcpuEmulation::Paused) => { @@ -339,10 +353,13 @@ impl Vcpu { // does not panic on resume, see https://docs.kernel.org/virt/kvm/api.html . // We do not want to fail if the call is not successful, because depending // that may be acceptable depending on the workload. + // TODO: once kvmclock is supported with Secret Freedom, remove this condition. 
#[cfg(target_arch = "x86_64")] - if let Err(err) = self.kvm_vcpu.fd.kvmclock_ctrl() { - METRICS.vcpu.kvmclock_ctrl_fails.inc(); - warn!("KVM_KVMCLOCK_CTRL call failed {}", err); + if self.userfault_channel.is_none() { + if let Err(err) = self.kvm_vcpu.fd.kvmclock_ctrl() { + METRICS.vcpu.kvmclock_ctrl_fails.inc(); + warn!("KVM_KVMCLOCK_CTRL call failed {}", err); + } } // Move to 'paused' state. @@ -486,6 +503,26 @@ impl Vcpu { StateMachine::finish() } + fn handle_userfault( + &mut self, + userfaultfd_data: UserfaultData, + ) -> Result { + let userfault_channel = self + .userfault_channel + .as_mut() + .expect("userfault channel not set"); + + userfault_channel + .send(userfaultfd_data) + .expect("Failed to send userfault data"); + + let _ = userfault_channel + .recv() + .expect("Failed to receive userfault response"); + + Ok(VcpuEmulation::Handled) + } + /// Runs the vCPU in KVM context and handles the kvm exit reason. /// /// Returns error or enum specifying whether emulation was handled or interrupted. @@ -496,7 +533,7 @@ impl Vcpu { return Ok(VcpuEmulation::Interrupted); } - match self.kvm_vcpu.fd.run() { + let result = match self.kvm_vcpu.fd.run() { Err(ref err) if err.errno() == libc::EINTR => { self.kvm_vcpu.fd.set_kvm_immediate_exit(0); // Notify that this KVM_RUN was interrupted. 
@@ -513,7 +550,14 @@ impl Vcpu { Ok(VcpuEmulation::Paused) } emulation_result => handle_kvm_exit(&mut self.kvm_vcpu.peripherals, emulation_result), - } + }; + + let userfault_data = match result { + Ok(VcpuEmulation::Userfault(userfault_data)) => userfault_data, + _ => return result, + }; + + self.handle_userfault(userfault_data) } } @@ -591,6 +635,16 @@ fn handle_kvm_exit( ))) } }, + VcpuExit::MemoryFault { flags, gpa, size } => { + if flags & KVM_MEMORY_EXIT_FLAG_USERFAULT == 0 { + Err(VcpuError::UnhandledKvmExit(format!( + "flags {:x} gpa {:x} size {:x}", + flags, gpa, size + ))) + } else { + Ok(VcpuEmulation::Userfault(UserfaultData { flags, gpa, size })) + } + } arch_specific_reason => { // run specific architecture emulation. peripherals.run_arch_emulation(arch_specific_reason) @@ -752,6 +806,8 @@ pub enum VcpuEmulation { Interrupted, /// Stopped. Stopped, + /// Userfault + Userfault(UserfaultData), /// Pause request #[cfg(feature = "gdb")] Paused, @@ -922,7 +978,7 @@ pub(crate) mod tests { pub(crate) fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, Vcpu) { let (kvm, mut vm) = setup_vm_with_memory(mem_size); - let (mut vcpus, _) = vm.create_vcpus(1).unwrap(); + let (mut vcpus, _, _) = vm.create_vcpus(1, false).unwrap(); let mut vcpu = vcpus.remove(0); #[cfg(target_arch = "aarch64")] diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 3119bc4cbb6..d2beca86c7b 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -7,18 +7,20 @@ use std::collections::HashMap; use std::fs::{File, OpenOptions}; -use std::io::Write; +use std::io::{Read, Write}; use std::os::fd::{AsFd, AsRawFd, FromRawFd}; use std::path::Path; use std::sync::Arc; +use bincode::{Decode, Encode}; use kvm_bindings::{ - KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEM_GUEST_MEMFD, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEMORY_ATTRIBUTE_PRIVATE, KVMIO, kvm_create_guest_memfd, kvm_memory_attributes, kvm_userspace_memory_region, - 
kvm_userspace_memory_region2, }; use kvm_ioctls::{Cap, VmFd}; use vmm_sys_util::eventfd::EventFd; +use vmm_sys_util::ioctl::ioctl_with_ref; +use vmm_sys_util::{ioctl_ioc_nr, ioctl_iow_nr}; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::arch::{VM_TYPE_FOR_SECRET_FREEDOM, host_page_size}; @@ -35,6 +37,86 @@ use crate::{DirtyBitmap, Vcpu, mem_size_mib}; pub(crate) const KVM_GMEM_NO_DIRECT_MAP: u64 = 1; +/// KVM userfault information +#[derive(Copy, Clone, Decode, Default, Eq, PartialEq, Debug, Encode)] +pub struct UserfaultData { + /// Flags + pub flags: u64, + /// Guest physical address + pub gpa: u64, + /// Size + pub size: u64, +} + +/// Errors associated with `UserfaultChannel`. +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum UserfaultChannelError { + /// Encode error {0} + Encode(#[from] bincode::error::EncodeError), + /// Decode error {0} + Decode(#[from] bincode::error::DecodeError), + /// IO error {0} + IO(#[from] std::io::Error), +} + +/// KVM userfault channel +#[derive(Debug)] +pub struct UserfaultChannel { + /// Sender + pub sender: File, + /// Receiver + pub receiver: File, +} + +impl UserfaultChannel { + fn bincode_config(&self) -> impl bincode::config::Config { + bincode::config::standard().with_fixed_int_encoding() + } + + /// Send `UserfaultData` to the channel. + pub fn send(&mut self, data: UserfaultData) -> Result<(), UserfaultChannelError> { + let encoded_data = bincode::encode_to_vec(data, self.bincode_config()) + .map_err(UserfaultChannelError::Encode)?; + + self.sender.write_all(&encoded_data)?; + + Ok(()) + } + + /// Receive `UserfaultData` from the channel. 
+ pub fn recv(&mut self) -> Result { + let size = bincode::encode_to_vec(UserfaultData::default(), self.bincode_config())?.len(); + + let mut encoded_data = vec![0u8; size as usize]; + self.receiver.read_exact(&mut encoded_data)?; + + bincode::decode_from_slice(&encoded_data, self.bincode_config()) + .map_err(UserfaultChannelError::Decode) + .map(|(data, _)| data) + } +} + +fn pipe2(flags: libc::c_int) -> std::io::Result<(File, File)> { + let mut fds = [0, 0]; + + // SAFETY: pipe2() is safe to call with a valid mutable pointer to an array of 2 integers + // The fds array is stack-allocated and lives for the entire unsafe block. + let res = unsafe { libc::pipe2(fds.as_mut_ptr(), flags) }; + + if res == 0 { + Ok(( + // SAFETY: fds[0] contains a valid file descriptor for the read end of the pipe + // We only convert successful pipe2() calls, and each fd is used exactly once. + unsafe { File::from_raw_fd(fds[0]) }, + // SAFETY: fds[1] contains a valid file descriptor for the write end of the pipe + // We only convert successful pipe2() calls, and each fd is used exactly once. + unsafe { File::from_raw_fd(fds[1]) }, + )) + } else { + Err(std::io::Error::last_os_error()) + } +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -59,6 +141,8 @@ pub enum VmError { Arch(#[from] ArchVmError), /// Error during eventfd operations: {0} EventFd(std::io::Error), + /// Failed to create a userfault channel: {0} + UserfaultChannel(std::io::Error), /// Failed to create vcpu: {0} CreateVcpu(VcpuError), /// The number of configured slots is bigger than the maximum reported by KVM @@ -73,6 +157,26 @@ pub enum VmError { SetMemoryAttributes(kvm_ioctls::Error), } +// Upstream `kvm_userspace_memory_region2` definition does not include `userfault_bitmap` field yet. 
+// TODO: revert to `kvm_userspace_memory_region2` from kvm-bindings +#[allow(non_camel_case_types)] +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +struct kvm_userspace_memory_region2 { + slot: u32, + flags: u32, + guest_phys_addr: u64, + memory_size: u64, + userspace_addr: u64, + guest_memfd_offset: u64, + guest_memfd: u32, + pad1: u32, + userfault_bitmap: u64, + pad2: [u64; 13], +} + +type VcpuCreationResult = Result<(Vec, EventFd, Option>), VmError>; + /// Contains Vm functions that are usable across CPU architectures impl Vm { /// Create a KVM VM @@ -135,24 +239,65 @@ impl Vm { }) } + fn create_userfault_channels( + &self, + secret_free: bool, + ) -> Result<(Option, Option), std::io::Error> { + if secret_free { + let (receiver_vcpu_to_vm, sender_vcpu_to_vm) = pipe2(libc::O_NONBLOCK)?; + let (receiver_vm_to_vcpu, sender_vm_to_vcpu) = pipe2(0)?; + Ok(( + Some(UserfaultChannel { + sender: sender_vcpu_to_vm, + receiver: receiver_vm_to_vcpu, + }), + Some(UserfaultChannel { + sender: sender_vm_to_vcpu, + receiver: receiver_vcpu_to_vm, + }), + )) + } else { + Ok((None, None)) + } + } + /// Creates the specified number of [`Vcpu`]s. /// /// The returned [`EventFd`] is written to whenever any of the vcpus exit. 
- pub fn create_vcpus(&mut self, vcpu_count: u8) -> Result<(Vec, EventFd), VmError> { + pub fn create_vcpus(&mut self, vcpu_count: u8, secret_free: bool) -> VcpuCreationResult { self.arch_pre_create_vcpus(vcpu_count)?; let exit_evt = EventFd::new(libc::EFD_NONBLOCK).map_err(VmError::EventFd)?; let mut vcpus = Vec::with_capacity(vcpu_count as usize); + let mut userfault_channels = Vec::with_capacity(vcpu_count as usize); for cpu_idx in 0..vcpu_count { let exit_evt = exit_evt.try_clone().map_err(VmError::EventFd)?; - let vcpu = Vcpu::new(cpu_idx, self, exit_evt).map_err(VmError::CreateVcpu)?; + + let (vcpu_channel, vmm_channel) = self + .create_userfault_channels(secret_free) + .map_err(VmError::UserfaultChannel)?; + + let vcpu = + Vcpu::new(cpu_idx, self, exit_evt, vcpu_channel).map_err(VmError::CreateVcpu)?; vcpus.push(vcpu); + + if secret_free { + userfault_channels.push(vmm_channel.expect("Failed to create userfault channel")); + } } self.arch_post_create_vcpus(vcpu_count)?; - Ok((vcpus, exit_evt)) + Ok(( + vcpus, + exit_evt, + if secret_free { + Some(userfault_channels) + } else { + None + }, + )) } /// Create a guest_memfd of the specified size @@ -181,16 +326,78 @@ impl Vm { pub fn register_memory_regions( &mut self, regions: Vec, + userfault_bitmap_memfd: Option<&File>, ) -> Result<(), VmError> { + let addr = match userfault_bitmap_memfd { + Some(file) => { + // SAFETY: the arguments to mmap cannot cause any memory unsafety in the rust sense + let addr = unsafe { + libc::mmap( + std::ptr::null_mut(), + usize::try_from(file.metadata().expect("Failed to get metadata").len()) + .expect("userfault bitmap file size is too large"), + libc::PROT_WRITE, + libc::MAP_SHARED, + file.as_raw_fd(), + 0, + ) + }; + + if addr == libc::MAP_FAILED { + panic!( + "Failed to mmap userfault bitmap file: {}", + std::io::Error::last_os_error() + ); + } + + Some(addr as u64) + } + None => None, + }; + for region in regions { - self.register_memory_region(region)? 
+ self.register_memory_region(region, addr)? } Ok(()) } + // TODO: remove when userfault support is merged upstream + fn set_user_memory_region2( + &self, + user_memory_region2: kvm_userspace_memory_region2, + ) -> Result<(), VmError> { + ioctl_iow_nr!( + KVM_SET_USER_MEMORY_REGION2, + KVMIO, + 0x49, + kvm_userspace_memory_region2 + ); + + #[allow(clippy::undocumented_unsafe_blocks)] + let ret = unsafe { + ioctl_with_ref( + self.fd(), + KVM_SET_USER_MEMORY_REGION2(), + &user_memory_region2, + ) + }; + if ret == 0 { + Ok(()) + } else { + Err(VmError::SetUserMemoryRegion(kvm_ioctls::Error::last())) + } + } + /// Register a new memory region to this [`Vm`]. - pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { + pub fn register_memory_region( + &mut self, + region: GuestRegionMmap, + userfault_addr: Option, + ) -> Result<(), VmError> { + // TODO: take it from kvm-bindings when merged upstream + const KVM_MEM_USERFAULT: u32 = 1 << 3; + let next_slot = self .guest_memory() .num_regions() @@ -218,6 +425,21 @@ impl Vm { (0, 0) }; + let userfault_bitmap = match userfault_addr { + Some(addr) => { + flags |= KVM_MEM_USERFAULT; + + let file_offset_start = region + .file_offset() + .expect("file offset not present") + .start(); + let pages_offset = file_offset_start / (host_page_size() as u64); + let bytes_offset = pages_offset / (u8::BITS as u64); + addr + bytes_offset + } + None => 0, + }; + let memory_region = kvm_userspace_memory_region2 { slot: next_slot, guest_phys_addr: region.start_addr().raw_value(), @@ -226,18 +448,14 @@ impl Vm { flags, guest_memfd, guest_memfd_offset, + userfault_bitmap, ..Default::default() }; let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?; if self.fd().check_extension(Cap::UserMemory2) { - // SAFETY: We are passing a valid memory region and operate on a valid KVM FD. 
- unsafe { - self.fd() - .set_user_memory_region2(memory_region) - .map_err(VmError::SetUserMemoryRegion)?; - } + self.set_user_memory_region2(memory_region)?; } else { // Something is seriously wrong if we manage to set these fields on a host that doesn't // even allow creation of guest_memfds! @@ -417,7 +635,7 @@ pub(crate) mod tests { pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm) { let (kvm, mut vm) = setup_vm(); let gm = single_region_mem_raw(mem_size); - vm.register_memory_regions(gm).unwrap(); + vm.register_memory_regions(gm, None).unwrap(); (kvm, vm) } @@ -447,14 +665,14 @@ pub(crate) mod tests { // Trying to set a memory region with a size that is not a multiple of GUEST_PAGE_SIZE // will result in error. let gm = single_region_mem_raw(0x10); - let res = vm.register_memory_regions(gm); + let res = vm.register_memory_regions(gm, None); assert_eq!( res.unwrap_err().to_string(), "Cannot set the memory regions: Invalid argument (os error 22)" ); let gm = single_region_mem_raw(0x1000); - let res = vm.register_memory_regions(gm); + let res = vm.register_memory_regions(gm, None); res.unwrap(); } @@ -489,7 +707,7 @@ pub(crate) mod tests { let region = GuestRegionMmap::new(region, GuestAddress(i as u64 * 0x1000)).unwrap(); - let res = vm.register_memory_region(region); + let res = vm.register_memory_region(region, None); if i >= max_nr_regions { assert!( @@ -516,7 +734,7 @@ pub(crate) mod tests { let vcpu_count = 2; let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); - let (vcpu_vec, _) = vm.create_vcpus(vcpu_count).unwrap(); + let (vcpu_vec, _, _) = vm.create_vcpus(vcpu_count, false).unwrap(); assert_eq!(vcpu_vec.len(), vcpu_count as usize); } diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py index 93380a9321d..edab67169c3 100644 --- a/tests/host_tools/memory.py +++ b/tests/host_tools/memory.py @@ -93,10 +93,16 @@ def is_guest_mem(self, size, guest_mem_bytes): # - its size matches the guest memory exactly # - its size is 
3328M # - its size is guest memory minus 3328M. - return size in ( - guest_mem_bytes, - self.X86_MEMORY_GAP_START, - guest_mem_bytes - self.X86_MEMORY_GAP_START, + # Note: we add a workaround for Secret Free VMs + # to also exclude the snapshot bounce buffer VMA from the check. + return ( + size + in ( + guest_mem_bytes, + self.X86_MEMORY_GAP_START, + guest_mem_bytes - self.X86_MEMORY_GAP_START, + ) + or size > guest_mem_bytes ) def check_samples(self): diff --git a/tests/integration_tests/functional/test_secret_freedom.py b/tests/integration_tests/functional/test_secret_freedom.py index 5f9758bb88c..50836752720 100644 --- a/tests/integration_tests/functional/test_secret_freedom.py +++ b/tests/integration_tests/functional/test_secret_freedom.py @@ -56,20 +56,3 @@ def test_secret_free_initrd(microvm_factory, guest_kernel): serial.rx(token="# ") serial.tx("mount |grep rootfs") serial.rx(token=f"rootfs on / type {INITRD_FILESYSTEM}") - - -def test_secret_free_snapshot_creation(microvm_factory, guest_kernel, rootfs): - """Test that snapshot creation works for secret hidden VMs""" - vm = microvm_factory.build(guest_kernel, rootfs) - vm.spawn() - vm.memory_monitor = None - vm.basic_config(secret_free=True) - vm.add_net_iface() - vm.start() - - snapshot = vm.snapshot_full() - - # After restoration, the VM will not be secret hidden anymore, as that's not supported yet. - # But we can at least test that in principle, the snapshot creation worked. 
- vm = microvm_factory.build_from_snapshot(snapshot) - vm.ssh.check_output("true") diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index deb9b2dd6c3..210d205f06d 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -338,9 +338,9 @@ def test_negative_snapshot_permissions(uvm_plain_rw, microvm_factory): microvm.spawn() expected_err = re.escape( - "Load snapshot error: Failed to restore from snapshot: Failed to load guest " - "memory: Error creating guest memory from file: Failed to load guest memory: " - "Permission denied (os error 13)" + "Load snapshot error: Failed to restore from snapshot: Failed to build microVM " + "from snapshot: Failed to load guest memory: Error creating guest memory from file: " + "Failed to load guest memory: Permission denied (os error 13)" ) with pytest.raises(RuntimeError, match=expected_err): microvm.restore_from_snapshot(snapshot, resume=True) diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index a67a24a4f6b..06fb59205de 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -12,18 +12,20 @@ @pytest.fixture(scope="function", name="snapshot") -def snapshot_fxt(microvm_factory, guest_kernel_linux_5_10, rootfs): +def snapshot_fxt(microvm_factory, guest_kernel_linux_5_10, rootfs, secret_free): """Create a snapshot of a microVM.""" basevm = microvm_factory.build(guest_kernel_linux_5_10, rootfs) basevm.spawn() - basevm.basic_config(vcpu_count=2, mem_size_mib=256) + basevm.basic_config(vcpu_count=2, mem_size_mib=256, secret_free=secret_free) basevm.add_net_iface() # Add a memory balloon. - basevm.api.balloon.put( - amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=0 - ) + # Note: Secret Free VMs do not support ballooning as of now. 
+ if not secret_free: + basevm.api.balloon.put( + amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=0 + ) basevm.start() @@ -43,9 +45,9 @@ def test_bad_socket_path(uvm_plain, snapshot): jailed_vmstate = vm.create_jailed_resource(snapshot.vmstate) expected_msg = re.escape( - "Load snapshot error: Failed to restore from snapshot: Failed to load guest " - "memory: Error creating guest memory from uffd: Failed to connect to UDS Unix stream: No " - "such file or directory (os error 2)" + "Load snapshot error: Failed to restore from snapshot: Failed to build microVM from " + "snapshot: Failed to load guest memory: Error creating guest memory from uffd: Failed " + "to connect to UDS Unix stream: No such file or directory (os error 2)" ) with pytest.raises(RuntimeError, match=expected_msg): vm.api.snapshot_load.put( @@ -69,9 +71,9 @@ def test_unbinded_socket(uvm_plain, snapshot): jailed_sock_path = vm.create_jailed_resource(socket_path) expected_msg = re.escape( - "Load snapshot error: Failed to restore from snapshot: Failed to load guest " - "memory: Error creating guest memory from uffd: Failed to connect to UDS Unix stream: " - "Connection refused (os error 111)" + "Load snapshot error: Failed to restore from snapshot: Failed to build microVM " + "from snapshot: Failed to load guest memory: Error creating guest memory from uffd: " + "Failed to connect to UDS Unix stream: Connection refused (os error 111)" ) with pytest.raises(RuntimeError, match=expected_msg): vm.api.snapshot_load.put( @@ -82,6 +84,16 @@ def test_unbinded_socket(uvm_plain, snapshot): vm.mark_killed() +def has_balloon_device(microvm): + """ + Check if a balloon device is present in the Firecracker microVM. + """ + response = microvm.api.vm_config.get() + config = response.json() + + return "balloon" in config and config["balloon"] is not None + + def test_valid_handler(uvm_plain, snapshot): """ Test valid uffd handler scenario. 
@@ -91,14 +103,16 @@ def test_valid_handler(uvm_plain, snapshot): vm.spawn() vm.restore_from_snapshot(snapshot, resume=True, uffd_handler_name="on_demand") - # Inflate balloon. - vm.api.balloon.patch(amount_mib=200) + # Secret Free VMs do not support ballooning so the balloon device is not added to them. + if has_balloon_device(vm): + # Inflate balloon. + vm.api.balloon.patch(amount_mib=200) - # Verify if the restored guest works. - vm.ssh.check_output("true") + # Verify if the restored guest works. + vm.ssh.check_output("true") - # Deflate balloon. - vm.api.balloon.patch(amount_mib=0) + # Deflate balloon. + vm.api.balloon.patch(amount_mib=0) # Verify if the restored guest works. vm.ssh.check_output("true") diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index d1668468beb..e54f1eb736d 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -46,7 +46,7 @@ def id(self): """Computes a unique id for this test instance""" return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb" - def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm: + def boot_vm(self, microvm_factory, guest_kernel, rootfs, secret_free) -> Microvm: """Creates the initial snapshot that will be loaded repeatedly to sample latencies""" vm = microvm_factory.build( guest_kernel, @@ -60,6 +60,7 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm: mem_size_mib=self.mem, rootfs_io_engine="Sync", huge_pages=self.huge_pages, + secret_free=secret_free, ) for _ in range(self.nets): @@ -98,7 +99,11 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm: ids=lambda x: x.id, ) def test_restore_latency( - microvm_factory, rootfs, guest_kernel_linux_5_10, test_setup, metrics + microvm_factory, + rootfs, + guest_kernel_linux_5_10, + test_setup, + metrics, ): """ Restores snapshots with vcpu/memory 
configuration, roughly scaling according to mem = (vcpus - 1) * 2048MB, @@ -116,7 +121,7 @@ def test_restore_latency( "huge pages with secret hiding kernels on ARM are currently failing" ) - vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) + vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs, False) metrics.set_dimensions( { @@ -159,6 +164,7 @@ def test_post_restore_latency( metrics, uffd_handler, huge_pages, + secret_free, ): """Collects latency metric of post-restore memory accesses done inside the guest""" if huge_pages != HugePagesConfig.NONE and uffd_handler is None: @@ -173,8 +179,16 @@ def test_post_restore_latency( "huge pages with secret hiding kernels on ARM are currently failing" ) + if secret_free and uffd_handler is None: + pytest.skip("Restoring from a file is not compatible with Secret Freedom") + + if secret_free and huge_pages != HugePagesConfig.NONE: + pytest.skip("Huge pages are not supported with Secret Freedom yet") + test_setup = SnapshotRestoreTest(mem=1024, vcpus=2, huge_pages=huge_pages) - vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) + vm = test_setup.boot_vm( + microvm_factory, guest_kernel_linux_5_10, rootfs, secret_free + ) metrics.set_dimensions( { @@ -226,6 +240,7 @@ def test_population_latency( huge_pages, vcpus, mem, + secret_free, ): """Collects population latency metrics (e.g. 
how long it takes UFFD handler to fault in all memory)""" if ( @@ -237,8 +252,13 @@ def test_population_latency( "huge pages with secret hiding kernels on ARM are currently failing" ) + if secret_free and huge_pages != HugePagesConfig.NONE: + pytest.skip("Huge pages are not supported with Secret Freedom yet") + test_setup = SnapshotRestoreTest(mem=mem, vcpus=vcpus, huge_pages=huge_pages) - vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) + vm = test_setup.boot_vm( + microvm_factory, guest_kernel_linux_5_10, rootfs, secret_free + ) metrics.set_dimensions( { @@ -281,16 +301,13 @@ def test_population_latency( def test_snapshot_create_latency( - microvm_factory, - guest_kernel_linux_5_10, - rootfs, - metrics, + microvm_factory, guest_kernel_linux_5_10, rootfs, metrics, secret_free ): """Measure the latency of creating a Full snapshot""" vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs, monitor_memory=False) vm.spawn() - vm.basic_config(vcpu_count=2, mem_size_mib=512) + vm.basic_config(vcpu_count=2, mem_size_mib=512, secret_free=secret_free) vm.start() vm.pin_threads(0)