Skip to content

[SH] add userfault support #5261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: feature/secret-hiding
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions resources/seccomp/aarch64-unknown-linux-musl.json
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't be required if we use rust channels instead of pipes :p

Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,9 @@
{
"syscall": "exit_group"
},
{
"syscall": "read"
},
{
"syscall": "write"
},
Expand Down
3 changes: 3 additions & 0 deletions resources/seccomp/x86_64-unknown-linux-musl.json
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,9 @@
{
"syscall": "exit_group"
},
{
"syscall": "read"
},
{
"syscall": "write"
},
Expand Down
91 changes: 75 additions & 16 deletions src/firecracker/examples/uffd/fault_all_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
mod uffd_utils;

use std::fs::File;
use std::os::fd::AsRawFd;
use std::os::unix::net::UnixListener;

use uffd_utils::{Runtime, UffdHandler};
use utils::time::{ClockType, get_time_us};

use crate::uffd_utils::uffd_continue;

fn main() {
let mut args = std::env::args();
let uffd_sock_path = args.nth(1).expect("No socket path given");
Expand All @@ -23,27 +26,83 @@ fn main() {
// Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker.
let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path");
let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
stream
.set_nonblocking(true)
.expect("Cannot set non-blocking");

let mut runtime = Runtime::new(stream, file);
runtime.install_panic_hook();
runtime.run(|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

match event {
userfaultfd::Event::Pagefault { .. } => {
let start = get_time_us(ClockType::Monotonic);
for region in uffd_handler.mem_regions.clone() {
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
runtime.run(
|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

if let userfaultfd::Event::Pagefault { addr, .. } = event {
let bit =
uffd_handler.addr_to_offset(addr.cast()) as usize / uffd_handler.page_size;

// If Secret Free, we know if this is the first fault based on the userfault
// bitmap state. Otherwise, we assume that we will only ever receive a single fault
// event via UFFD.
let are_we_faulted_yet = uffd_handler
.userfault_bitmap
.as_mut()
.map_or(false, |bitmap| !bitmap.is_bit_set(bit));

if are_we_faulted_yet {
// TODO: we currently ignore the result as we may attempt to
// populate the page that is already present as we may receive
// multiple minor fault events per page.
let _ = uffd_continue(
uffd_handler.uffd.as_raw_fd(),
addr as _,
uffd_handler.page_size as u64,
)
.inspect_err(|err| println!("Error during uffdio_continue: {:?}", err));
} else {
fault_all(uffd_handler, addr);
}
let end = get_time_us(ClockType::Monotonic);
}
},
|_uffd_handler: &mut UffdHandler, _offset: usize| {},
);
}

println!("Finished Faulting All: {}us", end - start);
/// Populates all guest memory in one go and reports how long that took.
///
/// Every region in `uffd_handler.mem_regions` is handled according to whether a
/// `guest_memfd` is present: without one, the whole region is served through
/// `serve_pf`; with one, the region is filled through `populate_via_write`, stepping
/// over a single page if the first write stops short. Afterwards the page at
/// `fault_addr` is served through `serve_pf`, and the elapsed time in microseconds is
/// printed to stdout.
///
/// * `uffd_handler` - handler holding the memory regions, page size and optional
///   `guest_memfd`.
/// * `fault_addr` - address of the page fault that triggered this call.
fn fault_all(uffd_handler: &mut UffdHandler, fault_addr: *mut libc::c_void) {
let start = get_time_us(ClockType::Monotonic);
// Iterate over a clone of the region list, since the handler is used mutably below.
for region in uffd_handler.mem_regions.clone() {
match uffd_handler.guest_memfd {
None => {
// No guest_memfd: serve the complete region through the regular page fault path.
uffd_handler.serve_pf(region.base_host_virt_addr as _, region.size);
}
Some(_) => {
// First attempt: try to write the whole region starting at its offset.
let written = uffd_handler.populate_via_write(region.offset as usize, region.size);

// This code is written under the assumption that the first fault triggered by
// Firecracker is either due to an MSR write (on x86) or due to device restoration
// reading from guest memory to check the virtio queues are sane (on
// ARM). This will be reported via a UFFD minor fault which needs to
// be handled via memcpy. Importantly, we get to the UFFD handler
// with the actual guest_memfd page already faulted in, meaning pwrite will stop
// once it gets to the offset of that page (e.g. written < region.size above).
// Thus, to fault in everything, we now need to skip this one page, write the
// remaining region, and then deal with the "gap" via uffd_handler.serve_pf().

// Only resume if something is left beyond the one skipped page; the second write
// starts right after that page and must cover the rest of the region.
if written < region.size - uffd_handler.page_size {
let r = uffd_handler.populate_via_write(
region.offset as usize + written + uffd_handler.page_size,
region.size - written - uffd_handler.page_size,
);
// Both writes together must account for everything except the skipped page.
assert_eq!(written + r, region.size - uffd_handler.page_size);
}
}
_ => panic!("Unexpected event on userfaultfd"),
}
});
}
// Finally serve the page that faulted (the "gap" left by the writes above).
uffd_handler.serve_pf(fault_addr.cast(), uffd_handler.page_size);
let end = get_time_us(ClockType::Monotonic);

println!("Finished Faulting All: {}us", end - start);
}
28 changes: 17 additions & 11 deletions src/firecracker/examples/uffd/malicious_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,23 @@ fn main() {
// Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker.
let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path");
let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
stream
.set_nonblocking(true)
.expect("Cannot set non-blocking");

let mut runtime = Runtime::new(stream, file);
runtime.run(|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

if let userfaultfd::Event::Pagefault { .. } = event {
panic!("Fear me! I am the malicious page fault handler.")
}
});
runtime.run(
|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

if let userfaultfd::Event::Pagefault { .. } = event {
panic!("Fear me! I am the malicious page fault handler.")
}
},
|_uffd_handler: &mut UffdHandler, _offset: usize| {},
);
}
189 changes: 121 additions & 68 deletions src/firecracker/examples/uffd/on_demand_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@
mod uffd_utils;

use std::fs::File;
use std::os::fd::AsRawFd;
use std::os::unix::net::UnixListener;

use uffd_utils::{Runtime, UffdHandler};

use crate::uffd_utils::uffd_continue;

fn main() {
let mut args = std::env::args();
let uffd_sock_path = args.nth(1).expect("No socket path given");
Expand All @@ -22,84 +25,134 @@ fn main() {
// Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker.
let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path");
let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
stream
.set_nonblocking(true)
.expect("Cannot set non-blocking");

let mut runtime = Runtime::new(stream, file);
runtime.install_panic_hook();
runtime.run(|uffd_handler: &mut UffdHandler| {
// !DISCLAIMER!
// When using UFFD together with the balloon device, this handler needs to deal with
// `remove` and `pagefault` events. There are multiple things to keep in mind in
// such setups:
//
// As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN
// -----------------------------------------------------------------------------------
//
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
// arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
// UFFD, and then go back to process the pre-fetched events.
//
// UFFD might receive events not in their causal order
// ---------------------------------------------------
//
// For example, the guest
// kernel might first respond to a balloon inflation by freeing some memory, and
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
// free memory range, which causes a `remove` event to be sent to UFFD. Then, the
// guest kernel might immediately fault the page in again (for example because
// deflate_on_oom was set), which causes a `pagefault` event to be sent to UFFD.
//
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
// this handler can receive the `pagefault` _before_ the `remove` event.
//
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
// to make sure no `remove` event is blocking us can result in the handler acting on
// the `pagefault` event before the `remove` message (despite the `remove` event being
// in the causal past of the `pagefault` event), which means that we will fault in a page
// from the snapshot file, while really we should be faulting in a zero page.
//
// In this example handler, we ignore this problem, to avoid
// complexity (under the assumption that the guest kernel will zero a newly faulted in
// page anyway). A production handler will most likely want to ensure that `remove`
// events for a specific range are always handled before `pagefault` events.
//
// Lastly, we still need to deal with the race condition where a `remove` event arrives
// in the UFFD queue after we got done reading all events, in which case we need to go
// back to reading more events before we can continue processing `pagefault`s.
let mut deferred_events = Vec::new();
runtime.run(
|uffd_handler: &mut UffdHandler| {
// !DISCLAIMER!
// When using UFFD together with the balloon device, this handler needs to deal with
// `remove` and `pagefault` events. There are multiple things to keep in mind in
// such setups:
//
// As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN
// -----------------------------------------------------------------------------------
//
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove`
// event arrives, we need to pre-fetch all other events up to the `remove`
// event, to unblock the UFFD, and then go back to process the
// pre-fetched events.
//
// UFFD might receive events not in their causal order
// ---------------------------------------------------
//
// For example, the guest
// kernel might first respond to a balloon inflation by freeing some memory, and
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
// free memory range, which causes a `remove` event to be sent to UFFD. Then, the
// guest kernel might immediately fault the page in again (for example because
// deflate_on_oom was set), which causes a `pagefault` event to be sent to UFFD.
//
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while
// the balloon device is handled by Firecracker on its VMM thread. This
// means that potentially this handler can receive the `pagefault` _before_
// the `remove` event.
//
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
// to make sure no `remove` event is blocking us can result in the handler acting on
// the `pagefault` event before the `remove` message (despite the `remove` event being
// in the causal past of the `pagefault` event), which means that we will fault in a
// page from the snapshot file, while really we should be faulting in a zero
// page.
//
// In this example handler, we ignore this problem, to avoid
// complexity (under the assumption that the guest kernel will zero a newly faulted in
// page anyway). A production handler will most likely want to ensure that `remove`
// events for a specific range are always handled before `pagefault` events.
//
// Lastly, we still need to deal with the race condition where a `remove` event arrives
// in the UFFD queue after we got done reading all events, in which case we need to go
// back to reading more events before we can continue processing `pagefault`s.
let mut deferred_events = Vec::new();

loop {
// First, try events that we couldn't handle last round
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));
loop {
// First, try events that we couldn't handle last round
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));

// Read all events from the userfaultfd.
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
events_to_handle.push(event);
}
// Read all events from the userfaultfd.
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg")
{
events_to_handle.push(event);
}

for event in events_to_handle.drain(..) {
// We expect to receive either a Page Fault or `remove`
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
let bit = uffd_handler.addr_to_offset(addr.cast()) as usize
/ uffd_handler.page_size;

for event in events_to_handle.drain(..) {
// We expect to receive either a Page Fault or `remove`
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
deferred_events.push(event);
if uffd_handler.userfault_bitmap.is_some() {
if uffd_handler
.userfault_bitmap
.as_mut()
.unwrap()
.is_bit_set(bit)
{
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
deferred_events.push(event);
}
} else {
// TODO: we currently ignore the result as we may attempt to
// populate the page that is already present as we may receive
// multiple minor fault events per page.
let _ = uffd_continue(
uffd_handler.uffd.as_raw_fd(),
addr as _,
uffd_handler.page_size as u64,
)
.inspect_err(|err| {
println!("uffdio_continue error: {:?}", err)
});
}
} else {
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
deferred_events.push(event);
}
}
}
userfaultfd::Event::Remove { start, end } => {
uffd_handler.mark_range_removed(start as u64, end as u64)
}
_ => panic!("Unexpected event on userfaultfd"),
}
userfaultfd::Event::Remove { start, end } => {
uffd_handler.mark_range_removed(start as u64, end as u64)
}
_ => panic!("Unexpected event on userfaultfd"),
}

// We assume that really only the above removed/pagefault interaction can result in
// deferred events. In that scenario, the loop will always terminate (unless
// newly arriving `remove` events end up indefinitely blocking it, but there's
// nothing we can do about that, and it's a largely theoretical
// problem).
if deferred_events.is_empty() {
break;
}
}
},
|uffd_handler: &mut UffdHandler, offset: usize| {
let bytes_written = uffd_handler.populate_via_write(offset, uffd_handler.page_size);

// We assume that really only the above removed/pagefault interaction can result in
// deferred events. In that scenario, the loop will always terminate (unless
// newly arriving `remove` events end up indefinitely blocking it, but there's nothing
// we can do about that, and it's a largely theoretical problem).
if deferred_events.is_empty() {
break;
if bytes_written == 0 {
println!(
"got a vcpu fault for an already populated page at offset {}",
offset
);
} else {
assert_eq!(bytes_written, uffd_handler.page_size);
}
}
});
},
);
}
Loading