Skip to content

Commit a3da136

Browse files
author
lif
committed
sled-agent: don't block during instance creation request from nexus
Alleviating request timeouts occurring when propolis zone installation takes too long (Propolis zone installation took 81 seconds and caused instance start to time out oxidecomputer#3927) by making the zone installation not happen during a request handler. Since the instance creation request no longer blocks, we need to wait before proceeding in some cases where we had assumed that a successful return from the Nexus call meant the instance existed, e.g. test_instance_serial now polls for the instance's running state before attempting to send serial console data requests.
1 parent c0ba701 commit a3da136

File tree

8 files changed

+337
-90
lines changed

8 files changed

+337
-90
lines changed

end-to-end-tests/src/instance_launch.rs

Lines changed: 56 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,14 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
77
use oxide_client::types::{
88
ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate,
99
InstanceCpuCount, InstanceCreate, InstanceDiskAttachment,
10-
InstanceNetworkInterfaceAttachment, SshKeyCreate,
10+
InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate,
1111
};
1212
use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt};
1313
use russh::{ChannelMsg, Disconnect};
1414
use russh_keys::key::{KeyPair, PublicKey};
1515
use russh_keys::PublicKeyBase64;
1616
use std::sync::Arc;
1717
use std::time::Duration;
18-
use tokio::time::sleep;
1918

2019
#[tokio::test]
2120
async fn instance_launch() -> Result<()> {
@@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> {
106105
type Error =
107106
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;
108107

108+
let instance_state = ctx
109+
.client
110+
.instance_view()
111+
.project(ctx.project_name.clone())
112+
.instance(instance.name.clone())
113+
.send()
114+
.await?
115+
.run_state;
116+
117+
if instance_state == InstanceState::Starting {
118+
return Err(Error::NotYet);
119+
}
120+
109121
let data = String::from_utf8_lossy(
110122
&ctx.client
111123
.instance_serial_console()
@@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> {
188200

189201
// check that we saw it on the console
190202
eprintln!("waiting for serial console");
191-
sleep(Duration::from_secs(5)).await;
192-
let data = String::from_utf8_lossy(
193-
&ctx.client
194-
.instance_serial_console()
195-
.project(ctx.project_name.clone())
196-
.instance(instance.name.clone())
197-
.most_recent(1024 * 1024)
198-
.max_bytes(1024 * 1024)
199-
.send()
200-
.await?
201-
.data,
203+
204+
let data = wait_for_condition(
205+
|| async {
206+
type Error =
207+
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;
208+
209+
let instance_state = ctx
210+
.client
211+
.instance_view()
212+
.project(ctx.project_name.clone())
213+
.instance(instance.name.clone())
214+
.send()
215+
.await?
216+
.run_state;
217+
218+
if instance_state == InstanceState::Starting {
219+
return Err(Error::NotYet);
220+
}
221+
222+
let data = String::from_utf8_lossy(
223+
&ctx.client
224+
.instance_serial_console()
225+
.project(ctx.project_name.clone())
226+
.instance(instance.name.clone())
227+
.most_recent(1024 * 1024)
228+
.max_bytes(1024 * 1024)
229+
.send()
230+
.await
231+
.map_err(|_e| Error::NotYet)?
232+
.data,
233+
)
234+
.into_owned();
235+
if data.contains("-----END SSH HOST KEY KEYS-----") {
236+
Ok(data)
237+
} else {
238+
Err(Error::NotYet)
239+
}
240+
},
241+
&Duration::from_secs(5),
242+
&Duration::from_secs(300),
202243
)
203-
.into_owned();
244+
.await?;
245+
204246
ensure!(
205247
data.contains("Hello, Oxide!"),
206248
"string not seen on console\n{}",

nexus/src/app/instance.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -986,6 +986,9 @@ impl super::Nexus {
986986
//
987987
// If the operation failed, kick the sled agent error back up to
988988
// the caller to let it decide how to handle it.
989+
//
990+
// When creating the zone for the first time, we just get
991+
// Ok(None) here, which is a no-op in write_returned_instance_state.
989992
match instance_put_result {
990993
Ok(state) => self
991994
.write_returned_instance_state(&instance_id, state)

nexus/tests/integration_tests/instances.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ use sled_agent_client::TestInterfaces as _;
6767
use std::convert::TryFrom;
6868
use std::net::Ipv4Addr;
6969
use std::sync::Arc;
70+
use std::time::Duration;
7071
use uuid::Uuid;
7172

7273
use dropshot::test_util::ClientTestContext;
@@ -80,6 +81,8 @@ use nexus_test_utils::resource_helpers::{
8081
use nexus_test_utils_macros::nexus_test;
8182
use nexus_types::external_api::shared::SiloRole;
8283
use omicron_sled_agent::sim;
84+
use omicron_test_utils::dev::poll;
85+
use omicron_test_utils::dev::poll::CondCheckError;
8386

8487
type ControlPlaneTestContext =
8588
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
@@ -3794,10 +3797,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) {
37943797

37953798
// Create an instance and poke it to ensure it's running.
37963799
let instance = create_instance(client, PROJECT_NAME, instance_name).await;
3797-
instance_simulate(nexus, &instance.identity.id).await;
3798-
let instance_next = instance_get(&client, &instance_url).await;
3800+
let instance_next = poll::wait_for_condition(
3801+
|| async {
3802+
instance_simulate(nexus, &instance.identity.id).await;
3803+
let instance_next = instance_get(&client, &instance_url).await;
3804+
if instance_next.runtime.run_state == InstanceState::Running {
3805+
Ok(instance_next)
3806+
} else {
3807+
Err(CondCheckError::<()>::NotYet)
3808+
}
3809+
},
3810+
&Duration::from_secs(5),
3811+
&Duration::from_secs(60),
3812+
)
3813+
.await
3814+
.unwrap();
37993815
identity_eq(&instance.identity, &instance_next.identity);
3800-
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
38013816
assert!(
38023817
instance_next.runtime.time_run_state_updated
38033818
> instance.runtime.time_run_state_updated

sled-agent/src/fakes/nexus.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ async fn sled_agent_put(
121121
struct InstancePathParam {
122122
instance_id: Uuid,
123123
}
124-
125124
#[endpoint {
126125
method = PUT,
127126
path = "/instances/{instance_id}",

0 commit comments

Comments
 (0)