
Commit 290a639

[sled-agent] VMM graceful shutdown timeout
Presently, sled-agent's `InstanceRunner` has two mechanisms for shutting down a VMM: sending an instance state PUT request to the `propolis-server` process for the `Stopped` state, or forcibly terminating the `propolis-server` and tearing down the zone. When a request to stop an instance is sent to the sled-agent, it uses the first mechanism, in which Propolis is politely asked to stop the instance --- which I'll refer to as "graceful shutdown". The forceful termination path is used when asked to unregister an instance whose VMM has not started up yet, when encountering an unrecoverable VMM error, or when killing an instance that was making use of an expunged disk.

Currently, these two paths don't really overlap: when Nexus asks a sled-agent to stop an instance, all it will do is politely ask Propolis to please stop the instance gracefully, and it will only fall back to violently shooting the zone in the face if Propolis returns the error that indicates it never knew about that instance in the first place. This means that, should a VMM get *stuck* while shutting down the instance, stopping it will never complete successfully, and the Propolis zone won't get cleaned up. This can happen due to, e.g., [a Crucible activation that will never complete][1]. Thus, the sled-agent should attempt to violently terminate a Propolis zone when a graceful shutdown of the VMM fails to complete in a timely manner.

This commit introduces a timeout for the graceful shutdown process. Now, when we send a PUT request to Propolis with the `Stopped` instance state, the sled-agent starts a 10-minute timer. If no update from Propolis indicating that the instance has transitioned to `Stopped` is received before the timer completes, the sled-agent proceeds with forceful termination of the Propolis zone.

Fixes #4004.

[1]: #4004 (comment)
1 parent be94d13 commit 290a639
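
To make the control flow easier to see outside the surrounding `InstanceRunner` machinery, here is a minimal, self-contained sketch of the same pattern: an optional, lazily-armed `tokio::time::Sleep` that is polled from a `select!` loop and falls back to `std::future::pending()` while disarmed. The `Request` enum, channel wiring, 3-second grace period, and `println!` calls are invented stand-ins for illustration (the real code reacts to Propolis state updates and uses the 10-minute `STOP_GRACE_PERIOD`), and it assumes tokio with the `macros`, `rt`, `time`, and `sync` features.

```rust
// Hypothetical sketch of a lazily-armed grace-period timer in a select! loop.
use std::pin::Pin;
use std::time::Duration;
use tokio::sync::mpsc;

enum Request {
    Stop,    // graceful stop requested; arms the grace timer
    Stopped, // acknowledgement that the VMM actually stopped
}

/// Awaits the grace-period timer if one has been armed; otherwise pends
/// forever so the corresponding `select!` arm never fires prematurely.
async fn stop_timeout_completed(
    stop_timeout: &mut Option<Pin<Box<tokio::time::Sleep>>>,
) {
    match stop_timeout {
        Some(timeout) => timeout.await,
        None => std::future::pending().await,
    }
}

#[tokio::main]
async fn main() {
    const GRACE_PERIOD: Duration = Duration::from_secs(3); // stand-in for 10 min

    let (tx, mut rx) = mpsc::channel::<Request>(8);
    // Simulate a stop request whose "stopped" acknowledgement never arrives.
    tx.send(Request::Stop).await.unwrap();

    let mut stop_timeout: Option<Pin<Box<tokio::time::Sleep>>> = None;
    loop {
        tokio::select! {
            // In the real code this arm has lower priority (biased select)
            // than the instance-monitor arm, so a late "stopped" report
            // still wins over the timeout.
            _ = stop_timeout_completed(&mut stop_timeout) => {
                println!("grace period elapsed; terminating forcefully");
                break;
            }
            Some(req) = rx.recv() => match req {
                Request::Stop => {
                    // Arm the timer only once so repeated stop requests
                    // don't reset the clock.
                    if stop_timeout.is_none() {
                        stop_timeout =
                            Some(Box::pin(tokio::time::sleep(GRACE_PERIOD)));
                    }
                    println!("graceful stop requested; grace timer armed");
                }
                Request::Stopped => {
                    println!("VMM reported that it stopped gracefully");
                    break;
                }
            },
        }
    }
}
```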

sled-agent/src/instance.rs

Lines changed: 59 additions & 1 deletion
```diff
@@ -47,7 +47,9 @@ use sled_storage::manager::StorageHandle;
 use slog::Logger;
 use std::net::IpAddr;
 use std::net::SocketAddr;
+use std::pin::Pin;
 use std::sync::Arc;
+use std::time::Duration;
 use tokio::sync::{mpsc, oneshot};
 use uuid::Uuid;

@@ -472,8 +474,31 @@ struct InstanceRunner {
 }

 impl InstanceRunner {
+    /// How long to wait for VMM shutdown to complete before forcefully
+    /// terminating the zone.
+    const STOP_GRACE_PERIOD: Duration = Duration::from_secs(60 * 10);
+
     async fn run(mut self, mut terminate_rx: mpsc::Receiver<TerminateRequest>) {
         use InstanceRequest::*;
+
+        // Timeout for stopping the instance gracefully.
+        //
+        // When we send Propolis a put-state request to transition to
+        // `Stopped`, we start this timer. If Propolis does not report back
+        // that it has stopped the instance within `STOP_GRACE_PERIOD`, we
+        // will forcibly terminate the VMM. This is to ensure that a totally
+        // stuck VMM does not prevent the Propolis zone from being cleaned up.
+        let mut stop_timeout = None;
+        async fn stop_timeout_completed(
+            stop_timeout: &mut Option<Pin<Box<tokio::time::Sleep>>>,
+        ) {
+            if let Some(ref mut timeout) = stop_timeout {
+                timeout.await
+            } else {
+                std::future::pending().await
+            }
+        }
+
         while !self.should_terminate {
             tokio::select! {
                 biased;
@@ -539,6 +564,24 @@ impl InstanceRunner {
                         },
                     }
                 },
+
+                // We are waiting for the VMM to stop, and the grace period has
+                // elapsed without us hearing back from Propolis that it has
+                // stopped the instance. Time to terminate it violently!
+                //
+                // Note that this must be a lower priority in the `select!` than
+                // instance monitor requests, as we would prefer to honor a
+                // message from the instance monitor indicating a successful
+                // stop, even if we have reached the timeout.
+                _ = stop_timeout_completed(&mut stop_timeout) => {
+                    warn!(
+                        self.log,
+                        "Instance failed to stop within the grace period, \
+                         terminating it violently!",
+                    );
+                    self.terminate(false).await;
+                }
+
                 // Requests to terminate the instance take priority over any
                 // other request to the instance.
                 request = terminate_rx.recv() => {
@@ -584,7 +627,22 @@ impl InstanceRunner {
                     tx.send(Ok(self.current_state()))
                         .map_err(|_| Error::FailedSendClientClosed)
                 },
-                PutState{ state, tx } => {
+                PutState { state, tx } => {
+                    // If we're going to stop the instance, start
+                    // the timeout after which we will forcefully
+                    // terminate the VMM.
+                    if let VmmStateRequested::Stopped = state {
+                        // Only start the stop timeout if there
+                        // isn't one already, so that additional
+                        // requests to stop coming in don't reset
+                        // the clock.
+                        if stop_timeout.is_none() {
+                            stop_timeout = Some(Box::pin(tokio::time::sleep(
+                                Self::STOP_GRACE_PERIOD
+                            )));
+                        }
+                    }
+
                     tx.send(self.put_state(state).await
                         .map(|r| VmmPutStateResponse { updated_runtime: Some(r) })
                         .map_err(|e| e.into()))
```