From 9b57dfc4d58e6ba40eaeb050c3fe836ed774b650 Mon Sep 17 00:00:00 2001 From: Cerul Alain Date: Fri, 30 Apr 2021 10:16:58 -0400 Subject: [PATCH 1/4] Unwind main() for clean shutdown (vs. libc::_exit()) The design of the existing main() function was such that Firecracker never unwound the stack to the top level in order to end the program. Instead, it would run some Firecracker-specific shutdown code and then call `unsafe { libc::_exit() }`. That skips some Rust-specific shutdown done by std::process::exit(). But even using that does not call destructors for the stack objects, so it is not a "clean shutdown": https://doc.rust-lang.org/std/process/fn.exit.html While a "dirty shutdown" can be a bit faster, the benefit of having non-panic shutdowns run their finalization is that it helps with the accounting of Valgrind and sanitizers. It's a good sanity check to make sure the program could exit cleanly if it wanted to. This commit attempts a somewhat minimally-invasive means of bubbling up an "ExitCode". Exiting was managed uniquely by the Vmm event subscriber during the process() callback, and requires running the vmm.stop() method. So this extends the Subscriber trait with an extended form of callback named process_exitable() that returns an optional exit code. The event loop doesn't have a clear (to me) way to get at the VMM to call stop() if a non-Vmm subscriber requests an exit. So the interface means any other subscriber that overrode process_exitable() would miss the required stop code. So it suggests the need for a redesign by someone more familiar with the architecture. However, it does accomplish a proof of concept by getting closer to a clean bill of health from a Valgrind analysis. 
--- src/firecracker/src/api_server_adapter.rs | 15 +++-- src/firecracker/src/main.rs | 31 +++++++-- src/polly/src/event_manager.rs | 82 ++++++++++++++++++++--- src/vmm/src/lib.rs | 19 +++--- src/vmm/tests/integration_tests.rs | 8 +-- 5 files changed, 116 insertions(+), 39 deletions(-) diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 759f633e3f9..f3f67af3d62 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -12,7 +12,7 @@ use std::{ use api_server::{ApiRequest, ApiResponse, ApiServer}; use logger::{error, warn, ProcessTimeReporter}; use mmds::MMDS; -use polly::event_manager::{EventManager, Subscriber}; +use polly::event_manager::{EventManager, Subscriber, ExitCode}; use seccomp::BpfProgram; use utils::{ epoll::{EpollEvent, EventSet}, @@ -43,7 +43,7 @@ impl ApiServerAdapter { vm_resources: VmResources, vmm: Arc>, event_manager: &mut EventManager, - ) { + ) -> ExitCode { let api_adapter = Arc::new(Mutex::new(Self { api_event_fd, from_api, @@ -53,10 +53,13 @@ impl ApiServerAdapter { event_manager .add_subscriber(api_adapter) .expect("Cannot register the api event to the event manager."); + loop { - event_manager - .run() + let opt_exit_code = event_manager + .run_maybe_exiting() .expect("EventManager events driver fatal error"); + + if let Some(exit_code) = opt_exit_code { return exit_code; } } } @@ -127,7 +130,7 @@ pub(crate) fn run_with_api( instance_info: InstanceInfo, process_time_reporter: ProcessTimeReporter, boot_timer_enabled: bool, -) { +) -> ExitCode { // FD to notify of API events. This is a blocking eventfd by design. // It is used in the config/pre-boot loop which is a simple blocking loop // which only consumes API events. 
@@ -242,5 +245,5 @@ pub(crate) fn run_with_api( vm_resources, vmm, &mut event_manager, - ); + ) } diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 130a83bbc8c..feab23add58 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -11,7 +11,7 @@ use std::process; use std::sync::{Arc, Mutex}; use logger::{error, info, IncMetric, ProcessTimeReporter, LOGGER, METRICS}; -use polly::event_manager::EventManager; +use polly::event_manager::{EventManager, ExitCode}; use seccomp::{BpfProgram, SeccompLevel}; use utils::arg_parser::{ArgParser, Argument}; use utils::terminal::Terminal; @@ -30,7 +30,7 @@ const DEFAULT_API_SOCK_PATH: &str = "/run/firecracker.socket"; const DEFAULT_INSTANCE_ID: &str = "anonymous-instance"; const FIRECRACKER_VERSION: &str = env!("FIRECRACKER_VERSION"); -fn main() { +fn main_exitable() -> ExitCode { LOGGER .configure(Some(DEFAULT_INSTANCE_ID.to_string())) .expect("Failed to register logger"); @@ -254,17 +254,32 @@ fn main() { instance_info, process_time_reporter, boot_timer_enabled, - ); + ) } else { run_without_api( seccomp_filter, vmm_config_json, &instance_info, boot_timer_enabled, - ); + ) } } +fn main () { + // This idiom is the prescribed way to get a clean shutdown of Rust (that will report + // no leaks in Valgrind or sanitizers). Calling `unsafe { libc::exit() }` does no + // cleanup, and std::process::exit() does more--but does not run destructors. So the + // best thing to do is to bubble up the exit code through the whole stack, and + // only exit when everything potentially destructible has cleaned itself up. + // + // https://doc.rust-lang.org/std/process/fn.exit.html + // + // See process_exitable() method of Subscriber trait for what triggers the exit_code. + // + let exit_code = main_exitable(); + std::process::exit(i32::from(exit_code)); +} + // Print supported snapshot data format versions. 
fn print_supported_snapshot_versions() { let mut snapshot_versions_str = "Supported snapshot data format versions:".to_string(); @@ -316,7 +331,7 @@ fn run_without_api( config_json: Option, instance_info: &InstanceInfo, bool_timer_enabled: bool, -) { +) -> ExitCode { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); // Right before creating the signalfd, @@ -357,8 +372,10 @@ fn run_without_api( // Run the EventManager that drives everything in the microVM. loop { - event_manager - .run() + let opt_exit_code = event_manager + .run_maybe_exiting() .expect("Failed to start the event manager"); + + if let Some(exit_code) = opt_exit_code { return exit_code; } } } diff --git a/src/polly/src/event_manager.rs b/src/polly/src/event_manager.rs index c2203e4e565..49ce3b4c4ea 100644 --- a/src/polly/src/event_manager.rs +++ b/src/polly/src/event_manager.rs @@ -12,6 +12,15 @@ use utils::epoll::{self, Epoll, EpollEvent}; pub type Result = std::result::Result; pub type Pollable = RawFd; +/// Process Exit Code, historically an 8 bit number on Unix; use a type alias +/// since `fn something(...) -> u8` is non-obvious. +/// +/// https://tldp.org/LDP/abs/html/exitcodes.html +/// +/// !!! Should i32 be used instead, since `std::process::exit` allows it? +/// +pub type ExitCode = u8; + /// Errors associated with epoll events handling. pub enum Error { /// Cannot create epoll fd. @@ -57,7 +66,25 @@ pub trait Subscriber { /// The only functions safe to call on this `EventManager` reference /// are `register`, `unregister` and `modify` which correspond to /// the `libc::epoll_ctl` operations. - fn process(&mut self, event: &EpollEvent, event_manager: &mut EventManager); + fn process(&mut self, _event: &EpollEvent, _event_manager: &mut EventManager) { + // + // !!! Can it be compile-time checked that one-and-only-one of process() or + // process_exitable() are overloaded? 
+ // + panic!("Either process() or process_exitable() must be implemented for Subscriber"); + } + + /// Variant of the process() callback that lets the subscriber signal a desire to exit + /// by returning a non-None ExitCode. + /// + /// This is what's actually called when dispatching subscriptions, but it delegates + /// to the plain form. That way most subscribers implement process() and can avoid + /// needing to `return None` to signal not exiting. + /// + fn process_exitable(&mut self, event: &EpollEvent, event_manager: &mut EventManager) -> Option { + self.process(event, event_manager); + None + } /// Returns a list of `EpollEvent` that this subscriber is interested in. fn interest_list(&self) -> Vec; @@ -170,41 +197,74 @@ impl EventManager { Ok(()) } - /// Wait for events, then dispatch to the registered event handlers. - pub fn run(&mut self) -> Result { - self.run_with_timeout(-1) - } - /// Wait for events for a maximum timeout of `miliseconds`. Dispatch the events to the /// registered signal handlers. - pub fn run_with_timeout(&mut self, milliseconds: i32) -> Result { + /// + /// First return is None if the events were processed and no exit events were encountered. + /// If an exit event happens and `--cleanup-level` is 2, the exit code will be returned. + /// Lower cleanup levels that get exit events will call libc::exit(), thus not return + /// + /// Second return is the number of dispatched events, which is only used by the test code. 
+ pub fn run_core(&mut self, milliseconds: i32) -> Result<(Option, usize)> { let event_count = match self.epoll.wait(milliseconds, &mut self.ready_events[..]) { Ok(event_count) => event_count, Err(e) if e.raw_os_error() == Some(libc::EINTR) => 0, Err(e) => return Err(Error::Poll(e)), }; - self.dispatch_events(event_count); + let opt_exit_code = self.dispatch_events(event_count); + + Ok((opt_exit_code, event_count)) // event count is checked by the tests + } + /// Version of run used by Firecracker, which doesn't need the count of dispatched events + /// or a timeout. It wants to be able to receive an exit code if `--cleanup-level` is 2. + pub fn run_maybe_exiting(&mut self) -> Result> { + let (opt_exit_code, _event_count) = self.run_core(-1)?; + Ok(opt_exit_code) + } + + /// Variation of run used by tests which want a count of dispatched events, and also + /// a timeout. The tests were written before the concept of clean shutdown, so they + /// assume any exit events would have terminated Firecracker abruptly. + pub fn run_with_timeout(&mut self, milliseconds: i32) -> Result { + let (opt_exit_code, event_count) = self.run_core(milliseconds)?; + assert!(opt_exit_code.is_none()); Ok(event_count) } - fn dispatch_events(&mut self, event_count: usize) { + /// Variation of run used by tests which don't want to use a timeout. + pub fn run(&mut self) -> Result { + self.run_with_timeout(-1) + } + + fn dispatch_events(&mut self, event_count: usize) -> Option { // Use the temporary, pre-allocated buffer to check ready events. 
for ev_index in 0..event_count { let event = &self.ready_events[ev_index].clone(); let pollable = event.fd(); if self.subscribers.contains_key(&pollable) { - self.subscribers + let opt_exit_code = self.subscribers .get_mut(&pollable) .unwrap() // Safe because we have already checked existence .clone() .lock() .expect("Poisoned lock") - .process(&event, self); + .process_exitable(&event, self); + + if opt_exit_code != None { + // + // Note: allowing other handlers to run could create an + // ambiguity of which exit code to use if more than one asked + // to exit. Thus we signal exit on the first handler that asks. + // + return opt_exit_code; + } } // TODO: Should we log an error in case the subscriber does not exist? } + + None } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 3e44193e90d..ff9ad98481f 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -58,7 +58,7 @@ use devices::virtio::{ }; use devices::BusDevice; use logger::{error, info, warn, LoggerError, MetricsError, METRICS}; -use polly::event_manager::{EventManager, Subscriber}; +use polly::event_manager::{EventManager, Subscriber, ExitCode}; use rate_limiter::BucketUpdate; use seccomp::BpfProgramRef; use snapshot::Persist; @@ -339,8 +339,9 @@ impl Vmm { .map_err(Error::I8042Error) } - /// Waits for all vCPUs to exit and terminates the Firecracker process. - pub fn stop(&mut self, exit_code: i32) { + /// Waits for all vCPUs to exit. Does not terminate the Firecracker process. + /// (See notes in main() about why ExitCode is bubbled up for clean shutdown.) + pub fn stop(&mut self) { info!("Vmm is stopping."); if let Some(observer) = self.events_observer.as_mut() { @@ -353,12 +354,6 @@ impl Vmm { if let Err(e) = METRICS.write() { error!("Failed to write metrics while stopping: {}", e); } - - // Exit from Firecracker using the provided exit code. Safe because we're terminating - // the process anyway. - unsafe { - libc::_exit(exit_code); - } } /// Saves the state of a paused Microvm. 
@@ -717,7 +712,7 @@ impl Drop for Vmm { impl Subscriber for Vmm { /// Handle a read event (EPOLLIN). - fn process(&mut self, event: &EpollEvent, _: &mut EventManager) { + fn process_exitable(&mut self, event: &EpollEvent, _: &mut EventManager) -> Option { let source = event.fd(); let event_set = event.event_set(); @@ -735,9 +730,11 @@ impl Subscriber for Vmm { _ => None, }) .unwrap_or(FC_EXIT_CODE_OK); - self.stop(i32::from(exit_code)); + self.stop(); + Some(exit_code) } else { error!("Spurious EventManager event for handler: Vmm"); + None } } diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 4c1003aae03..f699882acaf 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -73,7 +73,7 @@ fn test_build_microvm() { let _ = event_manager.run_with_timeout(500).unwrap(); #[cfg(target_arch = "x86_64")] - vmm.lock().unwrap().stop(-1); // If we got here, something went wrong. + vmm.lock().unwrap().stop(1); // If we got here, something went wrong. #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(0); } @@ -104,7 +104,7 @@ fn test_vmm_seccomp() { // Give the vCPUs a chance to attempt KVM_RUN. thread::sleep(Duration::from_millis(200)); // Should never get here. - vmm.lock().unwrap().stop(-1); + vmm.lock().unwrap().stop(1); } vmm_pid => { // Parent process: wait for the vmm to exit. @@ -160,7 +160,7 @@ fn test_pause_resume_microvm() { let _ = event_manager.run_with_timeout(500).unwrap(); #[cfg(target_arch = "x86_64")] - vmm.lock().unwrap().stop(-1); // If we got here, something went wrong. + vmm.lock().unwrap().stop(1); // If we got here, something went wrong. #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(0); } @@ -193,7 +193,7 @@ fn test_dirty_bitmap_error() { let _ = event_manager.run_with_timeout(500).unwrap(); #[cfg(target_arch = "x86_64")] - vmm.lock().unwrap().stop(-1); // If we got here, something went wrong. 
+ vmm.lock().unwrap().stop(1); // If we got here, something went wrong. #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(0); } From 960758e0eecc0ba69038f2740a4c91c0fda22910 Mon Sep 17 00:00:00 2001 From: Cerul Alain Date: Sat, 1 May 2021 10:18:36 -0400 Subject: [PATCH 2/4] Shutdown API server thread so it can thread::join() The general presumption for Firecracker was that it would be exited via an exit() call. This meant there was no way to signal threads to exit their loops so they could be join()'d to release their resources. Not being able to exit the stack normally (if one wants to) misses a sanity check that shutting down cleanly provides. The output of Valgrind and other tools is also less useful. Because the API Server thread is in a loop that does a blocking wait on a socket, the only way to tell it to stop looping is via information on that socket. This uses an internal HTTP GET signal to request that shutdown. Rather than forget the thread handle and the socket bind location, the main thread now sends the shutdown request and then waits while the thread joins. The request is for internal use only, however--because it is supposed to happen after the Vmm has been cleaned up and its contents destructed. A different approach for signaling the thread for this purpose could be used, e.g. to have the API server thread waiting on multiple events (the socket or this internal signal). But since this is only needed for clean shutdown, it is probably good enough. A faster shutdown using exit() would be a better default--this will likely only be used for debugging and Valgrind/sanitization. 
--- src/api_server/src/lib.rs | 119 ++++++++++++++-------- src/api_server/src/parsed_request.rs | 2 + src/firecracker/src/api_server_adapter.rs | 33 +++++- 3 files changed, 108 insertions(+), 46 deletions(-) diff --git a/src/api_server/src/lib.rs b/src/api_server/src/lib.rs index 1fe8110ccbf..1d4f25a0321 100644 --- a/src/api_server/src/lib.rs +++ b/src/api_server/src/lib.rs @@ -198,71 +198,106 @@ impl ApiServer { } server.start_server().expect("Cannot start HTTP server"); + loop { - match server.requests() { - Ok(request_vec) => { - for server_request in request_vec { - let request_processing_start_us = - utils::time::get_time_us(utils::time::ClockType::Monotonic); - server - .respond( - // Use `self.handle_request()` as the processing callback. - server_request.process(|request| { - self.handle_request(request, request_processing_start_us) - }), - ) - .or_else(|e| { - error!("API Server encountered an error on response: {}", e); - Ok(()) - })?; - let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic) - - request_processing_start_us; - debug!("Total previous API call duration: {} us.", delta_us); - if self.vmm_fatal_error { - // Flush the remaining outgoing responses - // and proceed to exit - server.flush_outgoing_writes(); - error!( - "Fatal error with exit code: {}", - FC_EXIT_CODE_BAD_CONFIGURATION - ); - unsafe { - libc::_exit(i32::from(FC_EXIT_CODE_BAD_CONFIGURATION)); - } - } - } - } - Err(e) => { + let request_vec = match server.requests() { + Ok(vec) => vec, + Err(e) => { // print request error, but keep server running error!( "API Server error on retrieving incoming request. Error: {}", e ); + continue + } + }; + + for server_request in request_vec { + let request_processing_start_us = + utils::time::get_time_us(utils::time::ClockType::Monotonic); + + // !!! ServerRequest::process() doesn't appear to do anything besides put a private + // id field into the response. 
But since it takes an inner function it cannot + // return from this function without giving a response. Review a better way of + // exiting cleanly than by reaching out of the closure to set a boolean. + + let mut shutting_down = false; + + server + .respond( + server_request.process(|request| { + match self.handle_request_or_finishing(request, request_processing_start_us) { + Some(response) => response, + None => { + shutting_down = true; + Response::new(Version::Http11, StatusCode::NoContent) + } + } + }), + ) + .or_else(|e| { + error!("API Server encountered an error on response: {}", e); + Ok(()) + })?; + + let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic) + - request_processing_start_us; + debug!("Total previous API call duration: {} us.", delta_us); + + if shutting_down { + debug!("/shutdown-internal request received, API server thread now ending itself"); + return Ok(()); + } + + if self.vmm_fatal_error { + // Flush the remaining outgoing responses + // and proceed to exit + server.flush_outgoing_writes(); + error!( + "Fatal error with exit code: {}", + FC_EXIT_CODE_BAD_CONFIGURATION + ); + unsafe { + libc::_exit(i32::from(FC_EXIT_CODE_BAD_CONFIGURATION)); + } } } } } /// Handles an API request received through the associated socket. 
- pub fn handle_request( + /// If None is given back, it means a ShutdownInternal was requested and no response + /// is expected (should be requested by the main thread, not by clients) + pub fn handle_request_or_finishing( &mut self, request: &Request, request_processing_start_us: u64, - ) -> Response { + ) -> Option { match ParsedRequest::try_from_request(request) { Ok(ParsedRequest::Sync(vmm_action)) => { - self.serve_vmm_action_request(vmm_action, request_processing_start_us) + Some(self.serve_vmm_action_request(vmm_action, request_processing_start_us)) } - Ok(ParsedRequest::GetInstanceInfo) => self.get_instance_info(), - Ok(ParsedRequest::GetMMDS) => self.get_mmds(), - Ok(ParsedRequest::PatchMMDS(value)) => self.patch_mmds(value), - Ok(ParsedRequest::PutMMDS(value)) => self.put_mmds(value), + Ok(ParsedRequest::GetInstanceInfo) => Some(self.get_instance_info()), + Ok(ParsedRequest::GetMMDS) => Some(self.get_mmds()), + Ok(ParsedRequest::PatchMMDS(value)) => Some(self.patch_mmds(value)), + Ok(ParsedRequest::PutMMDS(value)) => Some(self.put_mmds(value)), + Ok(ParsedRequest::ShutdownInternal) => None, Err(e) => { error!("{}", e); - e.into() + Some(e.into()) } } } + /// Variant of handle_request that is used by tests, and does not know about the + /// ShutdownInternal mechanism. So the response is non-optional. + pub fn handle_request( + &mut self, + request: &Request, + request_processing_start_us: u64, + ) -> Response { + self.handle_request_or_finishing(request, request_processing_start_us).unwrap() + } + fn serve_vmm_action_request( &mut self, vmm_action: Box, diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index e52ad6e4197..0237657fbed 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -31,6 +31,7 @@ pub(crate) enum ParsedRequest { PatchMMDS(Value), PutMMDS(Value), Sync(Box), + ShutdownInternal, // !!! 
not an API, used by shutdown to thread::join the API thread } impl ParsedRequest { @@ -57,6 +58,7 @@ impl ParsedRequest { match (request.method(), path, request.body.as_ref()) { (Method::Get, "", None) => parse_get_instance_info(), + (Method::Get, "shutdown-internal", None) => Ok(ParsedRequest::ShutdownInternal), (Method::Get, "balloon", None) => parse_get_balloon(path_tokens.get(1)), (Method::Get, "machine-config", None) => parse_get_machine_config(), (Method::Get, "mmds", None) => parse_get_mmds(), diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index f3f67af3d62..faa0d2d6b35 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -7,6 +7,9 @@ use std::{ sync::mpsc::{channel, Receiver, Sender, TryRecvError}, sync::{Arc, Mutex}, thread, + + os::unix::net::UnixStream, // for main thread to send ShutdownInternal to API thread + io::prelude::*, // UnixStream write_all() requires prelude }; use api_server::{ApiRequest, ApiResponse, ApiServer}; @@ -146,9 +149,10 @@ pub(crate) fn run_with_api( .try_clone() .expect("Failed to clone API event FD"); + let api_bind_path = bind_path.clone(); let api_seccomp_filter = seccomp_filter.clone(); // Start the separate API thread. 
- thread::Builder::new() + let api_thread = thread::Builder::new() .name("fc_api".to_owned()) .spawn(move || { mask_handled_signals().expect("Unable to install signal mask on API thread."); @@ -160,7 +164,7 @@ pub(crate) fn run_with_api( from_vmm, to_vmm_event_fd, ) - .bind_and_run(bind_path, process_time_reporter, api_seccomp_filter) + .bind_and_run(api_bind_path, process_time_reporter, api_seccomp_filter) { Ok(_) => (), Err(api_server::Error::Io(inner)) => match inner.kind() { @@ -238,12 +242,33 @@ pub(crate) fn run_with_api( .expect("Poisoned lock") .start(super::metrics::WRITE_METRICS_PERIOD_MS); - ApiServerAdapter::run_microvm( + let exit_code = ApiServerAdapter::run_microvm( api_event_fd, from_api, to_api, vm_resources, vmm, &mut event_manager, - ) + ); + + // We want to tell the API thread to shut down for a clean exit. But this is after + // the Vmm.stop() has been called, so it's a moment of internal finalization (as + // opposed to be something the client might call to shut the Vm down). Since it's + // an internal signal implementing it with an HTTP request is probably not the ideal + // way to do it...but having another way would involve waiting on the socket or some + // other signal. This leverages the existing wait. + // + // !!! Since the code is only needed for a "clean" shutdown mode, a non-clean mode + // could not respond to the request, making this effectively a debug-only feature. + // + let mut sock = UnixStream::connect(bind_path).unwrap(); + assert!(sock.write_all(b"GET /shutdown-internal HTTP/1.1\r\n\r\n").is_ok()); + + // This call to thread::join() should block until the API thread has processed the + // shutdown-internal and returns from its function. If it doesn't block here, then + // that means it got the message; so no need to process a response. 
+ // + api_thread.join().unwrap(); + + exit_code } From ace3ce0ec6216dc8b59e0f9308b1257b8752ef1f Mon Sep 17 00:00:00 2001 From: Cerul Alain Date: Sat, 1 May 2021 10:46:50 -0400 Subject: [PATCH 3/4] Shutdown VCPU threads so they can thread::join() This continues the work from the prior commit toward the goal of releasing all resources with a clean shutdown. The VCPU threads in non-test configurations were running with an unbreakable wait, so the only way to get past them was with exit(): // Wait indefinitely // The VMM thread will kill the entire process let barrier = Barrier::new(2); barrier.wait(); Here that is changed, along with several other entangled issues. One was that Drop methods were used to try and clean up the Vmm and the VcpuHandles, but these were not being called. The reason is that there was a cyclic dependency: threads held onto the Vmm, so the Vmm could not exit VCPU threads in its Drop method (as it wouldn't be dropped so long as the VCPU threads were around). Another complexity surrounds defining the difference between an exited thread (from which an exit event can be read out of the channel) and a finished thread (which tears down the channel). Trying to send a message to a thread which has finished will panic, and some CPUs might be in the exit state while others may not...and reading the "exit state" by taking the message out of the channel removes it. It's a bit complicated. This commit tries to move toward simplification by being explicit about the Vmm.stop() call being responsible for exiting the VCPUs, and making the VcpuHandle's Drop method only do the simple task of joining an exited thread. 
--- src/vmm/src/lib.rs | 96 ++++++++++++++++++++++++++++------ src/vmm/src/vstate/vcpu/mod.rs | 68 ++++++++++++------------ 2 files changed, 115 insertions(+), 49 deletions(-) diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index ff9ad98481f..2a8aac067f3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -36,7 +36,7 @@ use std::collections::HashMap; use std::fmt::{Display, Formatter}; use std::io; use std::os::unix::io::AsRawFd; -use std::sync::mpsc::RecvTimeoutError; +use std::sync::mpsc::{RecvTimeoutError, TryRecvError}; use std::sync::Mutex; use std::time::Duration; @@ -317,12 +317,26 @@ impl Vmm { /// Sends an exit command to the vCPUs. pub fn exit_vcpus(&mut self) -> Result<()> { - self.broadcast_vcpu_event( - VcpuEvent::Exit, - VcpuResponse::Exited(FC_EXIT_CODE_GENERIC_ERROR), - ) - .map_err(|_| Error::VcpuExit) + // + // We actually send a "Finish" event. If a VCPU has already exited, this is + // the only message it will accept...but running and paused will take it as well. + // It breaks out of the state machine loop so that the thread can be joined. + // + for handle in &self.vcpus_handles { + handle + .send_event(VcpuEvent::Finish) + .map_err(|_| Error::VcpuMessage)?; + } + + // The actual thread::join() that runs to release the thread's resource is done in + // the VcpuHandle's Drop trait. We can trigger that to happen now by clearing the + // list of handles (Vmm's Drop will also assert this list is empty). + // + self.vcpus_handles.clear(); + + Ok(()) } + /// Returns a reference to the inner `GuestMemoryMmap` object if present, or `None` otherwise. 
pub fn guest_memory(&self) -> &GuestMemoryMmap { &self.guest_memory @@ -344,6 +358,8 @@ impl Vmm { pub fn stop(&mut self) { info!("Vmm is stopping."); + self.exit_vcpus().unwrap(); // exit all not-already-exited VCPUs, join their threads + if let Some(observer) = self.events_observer.as_mut() { if let Err(e) = observer.on_vmm_stop() { warn!("{}", Error::VmmObserverTeardown(e)); @@ -706,7 +722,24 @@ fn construct_kvm_mpidrs(vcpu_states: &[VcpuState]) -> Vec { impl Drop for Vmm { fn drop(&mut self) { - let _ = self.exit_vcpus(); + // + // !!! This used to think it exited the cpus, which would do a thread join and tie up + // the vcpu threads: + // + // let _ = self.exit_vcpus(); + // + // But since the main firecracker process used exit() to terminate in mid-stack, the + // destructors for Vmm and other classes would not run. Even after mitigating that by + // running the exit from a top-level main() wrapper, the Vmm's Drop still was not + // happening because of a cycle: VCPU threads held references to the Vmm, so you couldn't + // have the Vmm drop be how the threads were exiting. Further, the VCPU threads would + // not exit cleanly--they'd just block on a barrier--and could not be thread::join'd. + // + // To be more explicit and untangle those problems, exit_vcpus() is moved into Vmm::stop(). + // So now, this Drop method just asserts that happened. It won't trigger if there's + // another situation where there's no Vmm Drop...but a Valgrind run will pick up the leak. + // + assert!(self.vcpus_handles.is_empty()); } } @@ -718,20 +751,49 @@ impl Subscriber for Vmm { if source == self.exit_evt.as_raw_fd() && event_set == EventSet::IN { let _ = self.exit_evt.read(); + + let mut opt_exit_code: Option = None; + // Query each vcpu for the exit_code. 
+ // + for handle in &self.vcpus_handles { + match handle.response_receiver().try_recv() { + Ok(VcpuResponse::Exited(status)) => { + if opt_exit_code.is_none() { + opt_exit_code = Some(status); + } else { + warn!("Multiple VCPU exit states detected, using first exit code"); + } + } + Err(TryRecvError::Empty) => {} // Nothing pending in channel + Ok(_response) => { + // + // !!! This removes a response from the VCPU channel. Is there + // anything to lose from that, given that we are exiting? + } + Err(e) => { + panic!("Error while looking for VCPU exit status: {}", e); + } + } + } + + // !!! The caller of this routine is receiving the exit code to bubble back up + // to the main() function to return cleanly. However, it does not have clean + // access to the Vmm to shut it down (here we have it, since it is `self`). It + // might seem tempting to put the stop() code in the Drop trait...but since + // stopping the Vmm involves shutting down VCPU threads which hold references + // on the Vmm, that presents a cycle. The solution of the moment is to do + // everything here...which means this is the only process_exitable() handler + // that will actually work with an exit code (all other Subscriber trait + // implementers must use process()) + // + self.stop(); + // If the exit_code can't be found on any vcpu, it means that the exit signal // has been issued by the i8042 controller in which case we exit with // FC_EXIT_CODE_OK. 
- let exit_code = self - .vcpus_handles - .iter() - .find_map(|handle| match handle.response_receiver().try_recv() { - Ok(VcpuResponse::Exited(exit_code)) => Some(exit_code), - _ => None, - }) - .unwrap_or(FC_EXIT_CODE_OK); - self.stop(); - Some(exit_code) + // + Some(opt_exit_code.unwrap_or(FC_EXIT_CODE_OK)) } else { error!("Spurious EventManager event for handler: Vmm"); None diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index 1e91bc69632..e995dd12341 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -6,8 +6,6 @@ // found in the THIRD-PARTY file. use libc::{c_int, c_void, siginfo_t}; -#[cfg(not(test))] -use std::sync::Barrier; #[cfg(test)] use std::sync::Mutex; use std::{ @@ -332,6 +330,7 @@ impl Vcpu { .expect("failed to send save not allowed status"); } Ok(VcpuEvent::Exit) => return self.exit(FC_EXIT_CODE_GENERIC_ERROR), + Ok(VcpuEvent::Finish) => return StateMachine::finish(), // Unhandled exit of the other end. Err(TryRecvError::Disconnected) => { // Move to 'exited' state. @@ -396,6 +395,7 @@ impl Vcpu { StateMachine::next(Self::paused) } Ok(VcpuEvent::Exit) => self.exit(FC_EXIT_CODE_GENERIC_ERROR), + Ok(VcpuEvent::Finish) => StateMachine::finish(), // Unhandled exit of the other end. Err(_) => { // Move to 'exited' state. @@ -404,41 +404,38 @@ impl Vcpu { } } - #[cfg(not(test))] // Transition to the exited state. fn exit(&mut self, exit_code: u8) -> StateMachine { self.response_sender .send(VcpuResponse::Exited(exit_code)) .expect("vcpu channel unexpectedly closed"); - if let Err(e) = self.exit_evt.write(1) { - METRICS.vcpu.failures.inc(); - error!("Failed signaling vcpu exit event: {}", e); - } - - // State machine reached its end. StateMachine::next(Self::exited) } - #[cfg(not(test))] // This is the main loop of the `Exited` state. fn exited(&mut self) -> StateMachine { - // Wait indefinitely. - // The VMM thread will kill the entire process. 
- let barrier = Barrier::new(2); - barrier.wait(); - - StateMachine::finish() - } + // + // !!! This signaled `EventFd` is not used by the main protocol. Is it specific to test? + // What does the test actually want to know by seeing it set? + // + if let Err(e) = self.exit_evt.write(1) { + METRICS.vcpu.failures.inc(); + error!("Failed signaling vcpu exit event: {}", e); + } - #[cfg(test)] - // In tests the main/vmm thread exits without 'exit()'ing the whole process. - // All channels get closed on the other side while this Vcpu thread is still running. - // This Vcpu thread should just do a clean finish without reporting back to the main thread. - fn exit(&mut self, _: u8) -> StateMachine { - self.exit_evt.write(1).unwrap(); - // State machine reached its end. - StateMachine::finish() + // !!! Stylistically we might like to force a transition to an exit state for all VCPUs + // before allowing them to accept a Finish and terminate the state loop. But you run into + // trouble with sending Exit to already exited VCPUs. This is because right now, reading + // (and removing) an exited event from the channel is how the main thread discovers exits + // arising from the VCPU itself. So by the time clean shutdown is performed, one can't + // tell if a VCPU is in the exit state or not. For the moment, the main benefit is that + // the exit state is able to refuse to process any other events besides Finish. + // + match self.event_receiver.recv() { + Ok(VcpuEvent::Finish) => StateMachine::finish(), + _ => panic!("exited() state of VCPU can only respond to Finish events (for thread join)") + } } #[cfg(not(test))] @@ -558,6 +555,8 @@ impl Drop for Vcpu { pub enum VcpuEvent { /// The vCPU will go to exited state when receiving this message. Exit, + /// Actual thread (channel) ends on finish--hence no response. + Finish, /// Pause the Vcpu. Pause, /// Event to resume the Vcpu. 
@@ -825,15 +824,20 @@ mod tests { } } - // In tests we need to close any pending Vcpu threads on test completion. + // Wait for the Vcpu thread to finish execution impl Drop for VcpuHandle { fn drop(&mut self) { - // Make sure the Vcpu is out of KVM_RUN. - self.send_event(VcpuEvent::Pause).unwrap(); - // Close the original channel so that the Vcpu thread errors and goes to exit state. - let (event_sender, _event_receiver) = channel(); - self.event_sender = event_sender; - // Wait for the Vcpu thread to finish execution + // + // !!! Previously, this Drop code would attempt to send messages to the Vcpu + // to put it in an exit state. We now assume that by the time a VcpuHandle is + // dropped, other code has run to get the state machine loop to finish so the + // thread is ready to join. The strategy of avoiding more complex messaging + // protocols during the Drop helps avoid cycles which were preventing a truly + // clean shutdown. + // + // If the code hangs at this point, that means that a Finish event was not + // sent by Vmm.stop(). + // self.vcpu_thread.take().unwrap().join().unwrap(); } } From 314da6ce34ad648dcb793f108443e25349ef29c5 Mon Sep 17 00:00:00 2001 From: Cerul Alain Date: Tue, 4 May 2021 10:23:17 -0400 Subject: [PATCH 4/4] Exit cleanly in debug build, abruptly in release build In order to mitigate the risks of deadlock during shutdown in production builds, this adds back the behavior of exiting the process abruptly after writing metrics files. This also offers better performance by letting the OS deal with cleaning up memory and resources. This limits the behavior of unwinding all the way up to main() and joining the threads to debug builds. The policy could be replaced by an independent flag (if someone thought it important to run Valgrind on release builds, e.g. because it was too slow using it with debug ones). 
--- src/api_server/src/lib.rs | 3 ++ src/api_server/src/parsed_request.rs | 13 ++++- src/firecracker/src/api_server_adapter.rs | 7 +-- src/firecracker/src/main.rs | 62 +++++++++++++++-------- src/vmm/src/lib.rs | 57 ++++++++++++++++----- 5 files changed, 105 insertions(+), 37 deletions(-) diff --git a/src/api_server/src/lib.rs b/src/api_server/src/lib.rs index 1d4f25a0321..b84496933eb 100644 --- a/src/api_server/src/lib.rs +++ b/src/api_server/src/lib.rs @@ -280,7 +280,10 @@ impl ApiServer { Ok(ParsedRequest::GetMMDS) => Some(self.get_mmds()), Ok(ParsedRequest::PatchMMDS(value)) => Some(self.patch_mmds(value)), Ok(ParsedRequest::PutMMDS(value)) => Some(self.put_mmds(value)), + + #[cfg(debug_assertions)] Ok(ParsedRequest::ShutdownInternal) => None, + Err(e) => { error!("{}", e); Some(e.into()) diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index 0237657fbed..63a98f724de 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -31,6 +31,8 @@ pub(crate) enum ParsedRequest { PatchMMDS(Value), PutMMDS(Value), Sync(Box), + + #[cfg(debug_assertions)] ShutdownInternal, // !!! not an API, used by shutdown to thread::join the API thread } @@ -58,7 +60,16 @@ impl ParsedRequest { match (request.method(), path, request.body.as_ref()) { (Method::Get, "", None) => parse_get_instance_info(), - (Method::Get, "shutdown-internal", None) => Ok(ParsedRequest::ShutdownInternal), + + #[cfg(debug_assertions)] + (Method::Get, "shutdown-internal", None) => { + // + // This isn't a user-facing API, and was added solely to facilitate clean shutdowns. + // Calling it manually will cause problems, so only enable it in debug builds. 
+ // + Ok(ParsedRequest::ShutdownInternal) + }, + (Method::Get, "balloon", None) => parse_get_balloon(path_tokens.get(1)), (Method::Get, "machine-config", None) => parse_get_machine_config(), (Method::Get, "mmds", None) => parse_get_mmds(), diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index faa0d2d6b35..11b09164c0c 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -251,6 +251,10 @@ pub(crate) fn run_with_api( &mut event_manager, ); + // Note: In the release build, this is never reached...because exit() is called + // abruptly (the OS does faster cleanup, and it reduces the risk of hanging). + // Top level main() will complain if the bubbling process happens in release builds. + // We want to tell the API thread to shut down for a clean exit. But this is after // the Vmm.stop() has been called, so it's a moment of internal finalization (as // opposed to be something the client might call to shut the Vm down). Since it's @@ -258,9 +262,6 @@ pub(crate) fn run_with_api( // way to do it...but having another way would involve waiting on the socket or some // other signal. This leverages the existing wait. // - // !!! Since the code is only needed for a "clean" shutdown mode, a non-clean mode - // could not respond to the request, making this effectively a debug-only feature. 
- // let mut sock = UnixStream::connect(bind_path).unwrap(); assert!(sock.write_all(b"GET /shutdown-internal HTTP/1.1\r\n\r\n").is_ok()); diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index feab23add58..e1eee08c270 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -16,6 +16,7 @@ use seccomp::{BpfProgram, SeccompLevel}; use utils::arg_parser::{ArgParser, Argument}; use utils::terminal::Terminal; use utils::validators::validate_instance_id; + use vmm::default_syscalls::get_seccomp_filter; use vmm::resources::VmResources; use vmm::signal_handler::{mask_handled_signals, SignalManager}; @@ -23,6 +24,9 @@ use vmm::version_map::FC_VERSION_TO_SNAP_VERSION; use vmm::vmm_config::instance_info::InstanceInfo; use vmm::vmm_config::logger::{init_logger, LoggerConfig, LoggerLevel}; +#[cfg(debug_assertions)] +use vmm::exit_firecracker_abruptly; + // The reason we place default API socket under /run is that API socket is a // runtime file. // see https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s15.html for more information. @@ -30,7 +34,7 @@ const DEFAULT_API_SOCK_PATH: &str = "/run/firecracker.socket"; const DEFAULT_INSTANCE_ID: &str = "anonymous-instance"; const FIRECRACKER_VERSION: &str = env!("FIRECRACKER_VERSION"); -fn main_exitable() -> ExitCode { +fn main_exitable() -> (SeccompLevel, ExitCode) { LOGGER .configure(Some(DEFAULT_INSTANCE_ID.to_string())) .expect("Failed to register logger"); @@ -206,13 +210,11 @@ fn main_exitable() -> ExitCode { } // It's safe to unwrap here because the field's been provided with a default value. 
- let seccomp_level = arguments.single_value("seccomp-level").unwrap(); - let seccomp_filter = get_seccomp_filter( - SeccompLevel::from_string(&seccomp_level).unwrap_or_else(|err| { - panic!("Invalid value for seccomp-level: {}", err); - }), - ) - .unwrap_or_else(|err| { + let seccomp_level_arg = arguments.single_value("seccomp-level").unwrap(); + let seccomp_level = SeccompLevel::from_string(&seccomp_level_arg).unwrap_or_else(|err| { + panic!("Invalid value for seccomp-level: {}", err); + }); + let seccomp_filter = get_seccomp_filter(seccomp_level).unwrap_or_else(|err| { + panic!("Could not create seccomp filter: {}", err); + }); @@ -224,7 +226,7 @@ fn main_exitable() -> ExitCode { let boot_timer_enabled = arguments.flag_present("boot-timer"); let api_enabled = !arguments.flag_present("no-api"); - if api_enabled { + let exit_code = if api_enabled { let bind_path = arguments .single_value("api-sock") .map(PathBuf::from) @@ -262,22 +264,40 @@ fn main_exitable() -> ExitCode { &instance_info, boot_timer_enabled, ) - } + }; + + (seccomp_level, exit_code) } -fn main () { - // This idiom is the prescribed way to get a clean shutdown of Rust (that will report - // no leaks in Valgrind or sanitizers). Calling `unsafe { libc::exit() }` does no - // cleanup, and std::process::exit() does more--but does not run destructors. So the - // best thing to do is to is bubble up the exit code through the whole stack, and - // only exit when everything potentially destructible has cleaned itself up. +// This idiom of wrapping main is the prescribed way to get a clean shutdown of Rust (that +// will report no leaks in Valgrind or sanitizers). It gives destructors a chance to run. +// +// See process_exitable() method of Subscriber trait for what triggers the exit_code. +// +// Note: the release build discards the tuple returned by main_exitable(), since it is +// expected to have already exited abruptly (inside Vmm::stop) before main()'s tail runs.
+// +fn main() { + // Release builds exit as soon as possible; faster and reduces impact of deadlock bugs. // - // https://doc.rust-lang.org/std/process/fn.exit.html - // - // See process_exitable() method of Subscriber trait for what triggers the exit_code. + #[cfg(not(debug_assertions))] + { + main_exitable(); + panic!("Release build bubbled exit_code to main() vs. ending abruptly earlier"); + } + + // Debug builds exit as cleanly as they are able to, for Valgrind and sanity checking. // - let exit_code = main_exitable(); - std::process::exit(i32::from(exit_code)); + #[cfg(debug_assertions)] + { + let (seccomp_level, exit_code) = main_exitable(); + + if seccomp_level == SeccompLevel::None { + std::process::exit(i32::from(exit_code)); // includes Rust library cleanup + } else { + exit_firecracker_abruptly(exit_code); // see notes on seccomp interaction + } + } } // Print supported snapshot data format versions. diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 2a8aac067f3..12fd25de4b2 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -227,6 +227,21 @@ pub(crate) fn mem_size_mib(guest_memory: &GuestMemoryMmap) -> u64 { guest_memory.map_and_fold(0, |(_, region)| region.len(), |a, b| a + b) >> 20 } +/// The default/recommended filter for production workloads is a tight whitelist that +/// doesn't include some syscalls done by Rust during thread exit (rt_sigprocmask is +/// the first to hit the filter). So when default seccomp filter is installed, +/// calling Rust's `std::process::exit()` will result in a seccomp violation panic. +/// +/// Calling the libc::_exit() function is even more basic than libc::exit(), and so +/// it must be used when seccomp filtering is enabled, unless something else changes. +/// +/// But see main() for how debug builds use std::process::exit() when they are verifying +/// shutdown can run cleanly, when seccomp filtering is off. +/// +pub fn exit_firecracker_abruptly(exit_code: ExitCode) -> ! { // ! 
-> diverging function + unsafe { libc::_exit(i32::from(exit_code)) } +} + /// Contains the state and associated methods required for the Firecracker VMM. pub struct Vmm { events_observer: Option>, @@ -353,13 +368,14 @@ impl Vmm { .map_err(Error::I8042Error) } - /// Waits for all vCPUs to exit. Does not terminate the Firecracker process. - /// (See notes in main() about why ExitCode is bubbled up for clean shutdown.) - pub fn stop(&mut self) { + /// This stops the VMM. If it's a release build, this will exit the Firecracker process + /// entirely. But debug builds enforce a higher level of cleanliness, and return an + /// exit code that the caller should bubble up to main(). + /// + pub fn stop(&mut self, exit_code: ExitCode) -> ExitCode { info!("Vmm is stopping."); - self.exit_vcpus().unwrap(); // exit all not-already-exited VCPUs, join their threads - + // Teardown the VMM (produces metrics we need to write out) if let Some(observer) = self.events_observer.as_mut() { if let Err(e) = observer.on_vmm_stop() { warn!("{}", Error::VmmObserverTeardown(e)); @@ -370,6 +386,23 @@ impl Vmm { if let Err(e) = METRICS.write() { error!("Failed to write metrics while stopping: {}", e); } + + // The release build exits here, to reduce the impact of shutdown deadlock bugs. + // It's also faster to let the OS do memory and resource cleanup, once semantically + // important shutdown (e.g. flushing and writing any open files) is done. + // + #[cfg(not(debug_assertions))] + exit_firecracker_abruptly(exit_code); + + // The debug build will shut down in an orderly fashion, bubbling up the exit code all + // the way to main and joining all threads (including ending the API server gracefully). + // + #[cfg(debug_assertions)] + { + self.exit_vcpus().unwrap(); // exit all not-already-exited VCPUs, join their threads + + exit_code + } } /// Saves the state of a paused Microvm. 
@@ -777,6 +810,12 @@ impl Subscriber for Vmm { } } + // If the exit_code can't be found on any vcpu, it means that the exit signal + // has been issued by the i8042 controller in which case we exit with + // FC_EXIT_CODE_OK. + // + let exit_code = opt_exit_code.unwrap_or(FC_EXIT_CODE_OK); + // !!! The caller of this routine is receiving the exit code to bubble back up // to the main() function to return cleanly. However, it does not have clean // access to the Vmm to shut it down (here we have it, since it is `self`). It @@ -787,13 +826,7 @@ impl Subscriber for Vmm { // that will actually work with an exit code (all other Subscriber trait // implementers must use process()) // - self.stop(); - - // If the exit_code can't be found on any vcpu, it means that the exit signal - // has been issued by the i8042 controller in which case we exit with - // FC_EXIT_CODE_OK. - // - Some(opt_exit_code.unwrap_or(FC_EXIT_CODE_OK)) + Some(self.stop(exit_code)) // exits abruptly if release build, else returns } else { error!("Spurious EventManager event for handler: Vmm"); None