Skip to content

Commit e45b7f1

Browse files
committed
Fix flaky instance zone-boot timeout test
- Fixes #6197 - Use a multi-threaded runtime for the test so we can literally sleep in the boot closure without blocking the test itself.
1 parent 30edf1c commit e45b7f1

File tree

1 file changed

+40
-24
lines changed

1 file changed

+40
-24
lines changed

sled-agent/src/instance.rs

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,7 @@ mod tests {
16171617
use std::net::Ipv6Addr;
16181618
use std::net::SocketAddrV6;
16191619
use std::str::FromStr;
1620+
use std::time::Duration;
16201621
use tokio::sync::watch::Receiver;
16211622
use tokio::time::timeout;
16221623

@@ -2043,7 +2044,7 @@ mod tests {
20432044
logctx.cleanup_successful();
20442045
}
20452046

2046-
#[tokio::test]
2047+
#[tokio::test(flavor = "multi_thread")]
20472048
async fn test_instance_create_timeout_while_creating_zone() {
20482049
let logctx = omicron_test_utils::dev::test_setup_log(
20492050
"test_instance_create_timeout_while_creating_zone",
@@ -2055,17 +2056,30 @@ mod tests {
20552056

20562057
// time out while booting zone, on purpose!
20572058
let boot_ctx = MockZones::boot_context();
2058-
let start = tokio::time::Instant::now();
2059+
const TIMEOUT: Duration = Duration::from_secs(1);
2060+
let (boot_continued_tx, boot_continued_rx) =
2061+
std::sync::mpsc::sync_channel(1);
20592062
boot_ctx.expect().times(1).return_once(move |_| {
2060-
// We need something that will look like the zone taking a long time
2061-
// to boot, but we cannot use a `tokio::time` construct here since
2062-
// this is a blocking context and we cannot call `block_on()`
2063-
// recursively. We advance time by this amount below, so this will
2064-
// most likely result in a small number of additional sleeps until
2065-
// the timeout has really elased.
2066-
while start.elapsed() < TIMEOUT_DURATION * 2 {
2067-
std::thread::sleep(std::time::Duration::from_millis(1));
2068-
}
2063+
// We need a way to slow down zone boot, but that doesn't block the
2064+
// entire Tokio runtime. Since this closure is synchronous, it also
2065+
// has no way to await anything, all waits are blocking. That means
2066+
// we cannot use a single-threaded runtime, which also means no
2067+
// manually advancing time. The test has to take the full "slow boot
2068+
// time".
2069+
//
2070+
// To do this, we use a multi-threaded runtime, and call
2071+
// block_in_place so that we can just literally sleep for a while.
2072+
// The sleep duration here is twice a timeout we set on the attempt
2073+
// to actually set the instance running below.
2074+
//
2075+
// This boot method also directly signals the main test code to
2076+
// continue when it's done sleeping to synchronize with it.
2077+
tokio::task::block_in_place(move || {
2078+
println!("MockZones::boot() called, waiting for timeout");
2079+
std::thread::sleep(TIMEOUT * 2);
2080+
println!("MockZones::boot() waited for timeout, continuing");
2081+
boot_continued_tx.send(()).unwrap();
2082+
});
20692083
Ok(())
20702084
});
20712085
let wait_ctx = illumos_utils::svc::wait_for_service_context();
@@ -2100,8 +2114,6 @@ mod tests {
21002114
.await
21012115
.expect("timed out creating Instance struct");
21022116

2103-
tokio::time::pause();
2104-
21052117
let (put_tx, put_rx) = oneshot::channel();
21062118

21072119
// pretending we're InstanceManager::ensure_state, try in vain to start
@@ -2110,20 +2122,16 @@ mod tests {
21102122
.await
21112123
.expect("failed to send Instance::put_state");
21122124

2113-
// Timeout our future waiting for the instance-state-change at
2114-
// `TIMEOUT_DURATION`, which should fail because zone boot will take
2115-
// twice that by construction.
2116-
let timeout_fut = timeout(TIMEOUT_DURATION, put_rx);
2117-
2118-
// And advance time by twice that, so that the actual
2119-
// `MockZones::boot()` call should be exercised (or will be soon).
2120-
tokio::time::advance(TIMEOUT_DURATION * 2).await;
2121-
2122-
tokio::time::resume();
2123-
2125+
// Timeout our future waiting for the instance-state-change at 1s. This
2126+
// is much shorter than the actual `TIMEOUT_DURATION`, but the test
2127+
// structure requires that we actually wait this period, since we cannot
2128+
// advance time manually in a multi-threaded runtime.
2129+
let timeout_fut = timeout(TIMEOUT, put_rx);
2130+
println!("Awaiting zone-boot timeout");
21242131
timeout_fut
21252132
.await
21262133
.expect_err("*should've* timed out waiting for Instance::put_state, but didn't?");
2134+
println!("Zone-boot timeout awaited");
21272135

21282136
if let ReceivedInstanceState::InstancePut(SledInstanceState {
21292137
vmm_state: VmmRuntimeState { state: VmmState::Running, .. },
@@ -2133,6 +2141,14 @@ mod tests {
21332141
panic!("Nexus's InstanceState should never have reached running if zone creation timed out");
21342142
}
21352143

2144+
// Notify the "boot" closure that it can continue, and then wait to
2145+
// ensure it's actually called.
2146+
println!("Waiting for zone-boot to continue");
2147+
tokio::task::spawn_blocking(move || boot_continued_rx.recv().unwrap())
2148+
.await
2149+
.unwrap();
2150+
println!("Received continued message from MockZones::boot()");
2151+
21362152
storage_harness.cleanup().await;
21372153
logctx.cleanup_successful();
21382154
}

0 commit comments

Comments
 (0)