Skip to content

Commit 0aeca43

Browse files
committed
Shutdown API server thread so it can thread::join()
The general presumption for Firecracker was that it would be exited via an exit() call. This meant there was no way to signal threads to exit their loops so they could be join()'d to release their resources. Not being able to exit the stack normally (if one wants to) misses a sanity check that shutting down cleanly provides. The output of Valgrind and other tools is also less useful. Because the API Server thread is in a loop that does a blocking wait on a socket, the only way to tell it to stop looping is via information on that socket. This uses an internal HTTP GET signal to request that shutdown. Rather than forget the thread handle and the socket bind location, the main thread now sends the shutdown request and then waits while the thread joins. The request is for internal use only, however--because it is supposed to happen after the Vmm has been cleaned up and its contents destructed. A different approach for signaling the thread for this purpose could be used, e.g. to have the API server thread waiting on multiple events (the socket or this internal signal). But since this is only needed for clean shutdown, it is probably good enough. A faster shutdown using exit() would be a better default--this will likely only be used for debugging and Valgrind/sanitization.
1 parent c526970 commit 0aeca43

File tree

3 files changed

+97
-45
lines changed

3 files changed

+97
-45
lines changed

src/api_server/src/lib.rs

Lines changed: 66 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -198,67 +198,92 @@ impl ApiServer {
198198
}
199199

200200
server.start_server().expect("Cannot start HTTP server");
201+
201202
loop {
202-
match server.requests() {
203-
Ok(request_vec) => {
204-
for server_request in request_vec {
205-
let request_processing_start_us =
206-
utils::time::get_time_us(utils::time::ClockType::Monotonic);
207-
server
208-
.respond(
209-
// Use `self.handle_request()` as the processing callback.
210-
server_request.process(|request| {
211-
self.handle_request(request, request_processing_start_us)
212-
}),
213-
)
214-
.or_else(|e| {
215-
error!("API Server encountered an error on response: {}", e);
216-
Ok(())
217-
})?;
218-
let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic)
219-
- request_processing_start_us;
220-
debug!("Total previous API call duration: {} us.", delta_us);
221-
if self.vmm_fatal_error {
222-
// Flush the remaining outgoing responses
223-
// and proceed to exit
224-
server.flush_outgoing_writes();
225-
error!(
226-
"Fatal error with exit code: {}",
227-
FC_EXIT_CODE_BAD_CONFIGURATION
228-
);
229-
unsafe {
230-
libc::_exit(i32::from(FC_EXIT_CODE_BAD_CONFIGURATION));
231-
}
232-
}
233-
}
234-
}
235-
Err(e) => {
203+
let request_vec = match server.requests() {
204+
Ok(vec) => vec,
205+
Err(e) => { // print request error, but keep server running
236206
error!(
237207
"API Server error on retrieving incoming request. Error: {}",
238208
e
239209
);
210+
continue
211+
}
212+
};
213+
214+
for server_request in request_vec {
215+
let request_processing_start_us =
216+
utils::time::get_time_us(utils::time::ClockType::Monotonic);
217+
218+
// !!! ServerRequest::process() doesn't appear to do anything besides put a private
219+
// id field into the response. But since it takes an inner function it cannot
220+
// return from this function without giving a response. Review a better way of
221+
// exiting cleanly than by reaching out of the closure to set a boolean.
222+
223+
let mut shutting_down = false;
224+
225+
server
226+
.respond(
227+
server_request.process(|request| {
228+
match self.handle_request(request, request_processing_start_us) {
229+
Some(response) => response,
230+
None => {
231+
shutting_down = true;
232+
Response::new(Version::Http11, StatusCode::NoContent)
233+
}
234+
}
235+
}),
236+
)
237+
.or_else(|e| {
238+
error!("API Server encountered an error on response: {}", e);
239+
Ok(())
240+
})?;
241+
242+
let delta_us = utils::time::get_time_us(utils::time::ClockType::Monotonic)
243+
- request_processing_start_us;
244+
debug!("Total previous API call duration: {} us.", delta_us);
245+
246+
if shutting_down {
247+
debug!("/shutdown-internal request received, API server thread now ending itself");
248+
return Ok(());
249+
}
250+
251+
if self.vmm_fatal_error {
252+
// Flush the remaining outgoing responses
253+
// and proceed to exit
254+
server.flush_outgoing_writes();
255+
error!(
256+
"Fatal error with exit code: {}",
257+
FC_EXIT_CODE_BAD_CONFIGURATION
258+
);
259+
unsafe {
260+
libc::_exit(i32::from(FC_EXIT_CODE_BAD_CONFIGURATION));
261+
}
240262
}
241263
}
242264
}
243265
}
244266

245267
/// Handles an API request received through the associated socket.
268+
/// If None is given back, it means a ShutdownInternal was requested and no response
269+
/// is expected (should be requested by the main thread, not by clients)
246270
pub fn handle_request(
247271
&mut self,
248272
request: &Request,
249273
request_processing_start_us: u64,
250-
) -> Response {
274+
) -> Option<Response> {
251275
match ParsedRequest::try_from_request(request) {
252276
Ok(ParsedRequest::Sync(vmm_action)) => {
253-
self.serve_vmm_action_request(vmm_action, request_processing_start_us)
277+
Some(self.serve_vmm_action_request(vmm_action, request_processing_start_us))
254278
}
255-
Ok(ParsedRequest::GetInstanceInfo) => self.get_instance_info(),
256-
Ok(ParsedRequest::GetMMDS) => self.get_mmds(),
257-
Ok(ParsedRequest::PatchMMDS(value)) => self.patch_mmds(value),
258-
Ok(ParsedRequest::PutMMDS(value)) => self.put_mmds(value),
279+
Ok(ParsedRequest::GetInstanceInfo) => Some(self.get_instance_info()),
280+
Ok(ParsedRequest::GetMMDS) => Some(self.get_mmds()),
281+
Ok(ParsedRequest::PatchMMDS(value)) => Some(self.patch_mmds(value)),
282+
Ok(ParsedRequest::PutMMDS(value)) => Some(self.put_mmds(value)),
283+
Ok(ParsedRequest::ShutdownInternal) => None,
259284
Err(e) => {
260285
error!("{}", e);
261-
e.into()
286+
Some(e.into())
262287
}
263288
}
264289
}

src/api_server/src/parsed_request.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ pub(crate) enum ParsedRequest {
3131
PatchMMDS(Value),
3232
PutMMDS(Value),
3333
Sync(Box<VmmAction>),
34+
ShutdownInternal, // !!! not an API, used by shutdown to thread::join the API thread
3435
}
3536

3637
impl ParsedRequest {
@@ -57,6 +58,7 @@ impl ParsedRequest {
5758

5859
match (request.method(), path, request.body.as_ref()) {
5960
(Method::Get, "", None) => parse_get_instance_info(),
61+
(Method::Get, "shutdown-internal", None) => Ok(ParsedRequest::ShutdownInternal),
6062
(Method::Get, "balloon", None) => parse_get_balloon(path_tokens.get(1)),
6163
(Method::Get, "machine-config", None) => parse_get_machine_config(),
6264
(Method::Get, "mmds", None) => parse_get_mmds(),

src/firecracker/src/api_server_adapter.rs

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ use std::{
77
sync::mpsc::{channel, Receiver, Sender, TryRecvError},
88
sync::{Arc, Mutex},
99
thread,
10+
11+
os::unix::net::UnixStream, // for main thread to send ShutdownInternal to API thread
12+
io::prelude::*, // UnixStream write_all() requires prelude
1013
};
1114

1215
use api_server::{ApiRequest, ApiResponse, ApiServer};
@@ -146,9 +149,10 @@ pub(crate) fn run_with_api(
146149
.try_clone()
147150
.expect("Failed to clone API event FD");
148151

152+
let api_bind_path = bind_path.clone();
149153
let api_seccomp_filter = seccomp_filter.clone();
150154
// Start the separate API thread.
151-
thread::Builder::new()
155+
let api_thread = thread::Builder::new()
152156
.name("fc_api".to_owned())
153157
.spawn(move || {
154158
mask_handled_signals().expect("Unable to install signal mask on API thread.");
@@ -160,7 +164,7 @@ pub(crate) fn run_with_api(
160164
from_vmm,
161165
to_vmm_event_fd,
162166
)
163-
.bind_and_run(bind_path, process_time_reporter, api_seccomp_filter)
167+
.bind_and_run(api_bind_path, process_time_reporter, api_seccomp_filter)
164168
{
165169
Ok(_) => (),
166170
Err(api_server::Error::Io(inner)) => match inner.kind() {
@@ -238,12 +242,33 @@ pub(crate) fn run_with_api(
238242
.expect("Poisoned lock")
239243
.start(super::metrics::WRITE_METRICS_PERIOD_MS);
240244

241-
ApiServerAdapter::run_microvm(
245+
let exit_code = ApiServerAdapter::run_microvm(
242246
api_event_fd,
243247
from_api,
244248
to_api,
245249
vm_resources,
246250
vmm,
247251
&mut event_manager,
248-
)
252+
);
253+
254+
// We want to tell the API thread to shut down for a clean exit. But this is after
255+
// the Vmm.stop() has been called, so it's a moment of internal finalization (as
256+
// opposed to be something the client might call to shut the Vm down). Since it's
257+
// an internal signal implementing it with an HTTP request is probably not the ideal
258+
// way to do it...but having another way would involve waiting on the socket or some
259+
// other signal. This leverages the existing wait.
260+
//
261+
// !!! Since the code is only needed for a "clean" shutdown mode, a non-clean mode
262+
// could not respond to the request, making this effectively a debug-only feature.
263+
//
264+
let mut sock = UnixStream::connect(bind_path).unwrap();
265+
assert!(sock.write_all(b"GET /shutdown-internal HTTP/1.1\r\n\r\n").is_ok());
266+
267+
// This call to thread::join() should block until the API thread has processed the
268+
// shutdown-internal and returns from its function. If it doesn't block here, then
269+
// that means it got the message; so no need to process a response.
270+
//
271+
api_thread.join().unwrap();
272+
273+
exit_code
249274
}

0 commit comments

Comments
 (0)