Skip to content

Commit 5beedcf

Browse files
authored
Fix flaky instance zone-boot timeout test (#6238)
- Fixes #6197 - Use a multi-threaded runtime for the test so we can literally sleep in the boot closure without blocking the test itself.
1 parent 5831dc6 commit 5beedcf

File tree

1 file changed

+47
-24
lines changed

1 file changed

+47
-24
lines changed

sled-agent/src/instance.rs

Lines changed: 47 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,7 @@ mod tests {
16171617
use std::net::Ipv6Addr;
16181618
use std::net::SocketAddrV6;
16191619
use std::str::FromStr;
1620+
use std::time::Duration;
16201621
use tokio::sync::watch::Receiver;
16211622
use tokio::time::timeout;
16221623

@@ -2043,7 +2044,7 @@ mod tests {
20432044
logctx.cleanup_successful();
20442045
}
20452046

2046-
#[tokio::test]
2047+
#[tokio::test(flavor = "multi_thread")]
20472048
async fn test_instance_create_timeout_while_creating_zone() {
20482049
let logctx = omicron_test_utils::dev::test_setup_log(
20492050
"test_instance_create_timeout_while_creating_zone",
@@ -2055,17 +2056,37 @@ mod tests {
20552056

20562057
// time out while booting zone, on purpose!
20572058
let boot_ctx = MockZones::boot_context();
2058-
let start = tokio::time::Instant::now();
2059+
const TIMEOUT: Duration = Duration::from_secs(1);
2060+
let (boot_continued_tx, boot_continued_rx) =
2061+
std::sync::mpsc::sync_channel(1);
2062+
let boot_log = log.clone();
20592063
boot_ctx.expect().times(1).return_once(move |_| {
2060-
// We need something that will look like the zone taking a long time
2061-
// to boot, but we cannot use a `tokio::time` construct here since
2062-
// this is a blocking context and we cannot call `block_on()`
2063-
// recursively. We advance time by this amount below, so this will
2064-
// most likely result in a small number of additional sleeps until
2065-
// the timeout has really elased.
2066-
while start.elapsed() < TIMEOUT_DURATION * 2 {
2067-
std::thread::sleep(std::time::Duration::from_millis(1));
2068-
}
2064+
// We need a way to slow down zone boot, but that doesn't block the
2065+
// entire Tokio runtime. Since this closure is synchronous, it also
2066+
// has no way to await anything, all waits are blocking. That means
2067+
// we cannot use a single-threaded runtime, which also means no
2068+
// manually advancing time. The test has to take the full "slow boot
2069+
// time".
2070+
//
2071+
// To do this, we use a multi-threaded runtime, and call
2072+
// block_in_place so that we can just literally sleep for a while.
2073+
// The sleep duration here is twice a timeout we set on the attempt
2074+
// to actually set the instance running below.
2075+
//
2076+
// This boot method also directly signals the main test code to
2077+
// continue when it's done sleeping to synchronize with it.
2078+
tokio::task::block_in_place(move || {
2079+
debug!(
2080+
boot_log,
2081+
"MockZones::boot() called, waiting for timeout"
2082+
);
2083+
std::thread::sleep(TIMEOUT * 2);
2084+
debug!(
2085+
boot_log,
2086+
"MockZones::boot() waited for timeout, continuing"
2087+
);
2088+
boot_continued_tx.send(()).unwrap();
2089+
});
20692090
Ok(())
20702091
});
20712092
let wait_ctx = illumos_utils::svc::wait_for_service_context();
@@ -2100,8 +2121,6 @@ mod tests {
21002121
.await
21012122
.expect("timed out creating Instance struct");
21022123

2103-
tokio::time::pause();
2104-
21052124
let (put_tx, put_rx) = oneshot::channel();
21062125

21072126
// pretending we're InstanceManager::ensure_state, try in vain to start
@@ -2110,20 +2129,16 @@ mod tests {
21102129
.await
21112130
.expect("failed to send Instance::put_state");
21122131

2113-
// Timeout our future waiting for the instance-state-change at
2114-
// `TIMEOUT_DURATION`, which should fail because zone boot will take
2115-
// twice that by construction.
2116-
let timeout_fut = timeout(TIMEOUT_DURATION, put_rx);
2117-
2118-
// And advance time by twice that, so that the actual
2119-
// `MockZones::boot()` call should be exercised (or will be soon).
2120-
tokio::time::advance(TIMEOUT_DURATION * 2).await;
2121-
2122-
tokio::time::resume();
2123-
2132+
// Timeout our future waiting for the instance-state-change at 1s. This
2133+
// is much shorter than the actual `TIMEOUT_DURATION`, but the test
2134+
// structure requires that we actually wait this period, since we cannot
2135+
// advance time manually in a multi-threaded runtime.
2136+
let timeout_fut = timeout(TIMEOUT, put_rx);
2137+
debug!(log, "Awaiting zone-boot timeout");
21242138
timeout_fut
21252139
.await
21262140
.expect_err("*should've* timed out waiting for Instance::put_state, but didn't?");
2141+
debug!(log, "Zone-boot timeout awaited");
21272142

21282143
if let ReceivedInstanceState::InstancePut(SledInstanceState {
21292144
vmm_state: VmmRuntimeState { state: VmmState::Running, .. },
@@ -2133,6 +2148,14 @@ mod tests {
21332148
panic!("Nexus's InstanceState should never have reached running if zone creation timed out");
21342149
}
21352150

2151+
// Notify the "boot" closure that it can continue, and then wait to
2152+
// ensure it's actually called.
2153+
debug!(log, "Waiting for zone-boot to continue");
2154+
tokio::task::spawn_blocking(move || boot_continued_rx.recv().unwrap())
2155+
.await
2156+
.unwrap();
2157+
debug!(log, "Received continued message from MockZones::boot()");
2158+
21362159
storage_harness.cleanup().await;
21372160
logctx.cleanup_successful();
21382161
}

0 commit comments

Comments
 (0)