Skip to content

Commit 7e7ed02

Browse files
author
lif
committed
wip: might be worth testing now...
1 parent a77e315 commit 7e7ed02

File tree

2 files changed

+60
-10
lines changed

2 files changed

+60
-10
lines changed

nexus/src/app/instance.rs

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -983,10 +983,35 @@ impl super::Nexus {
983983
// long -- say, InstanceRuntimeState::time_updated
984984
// plus the timeout, assuming time_updated is the
985985
// right point to measure from.
986-
tokio::spawn(async {
986+
let prev_instance_runtime =
987+
prev_instance_state.runtime_state.clone();
988+
let db_datastore_weak =
989+
Arc::downgrade(&self.db_datastore);
990+
let log = self
991+
.log
992+
.new(o!("component" => "Instance timeout"));
993+
tokio::spawn(async move {
987994
tokio::time::sleep(Duration::from_secs(120))
988995
.await;
989-
todo!("fail instance")
996+
if let Some(db_datastore) =
997+
db_datastore_weak.upgrade()
998+
{
999+
Self::mark_instance_failed_inner(
1000+
&db_datastore,
1001+
&instance_id,
1002+
&prev_instance_runtime,
1003+
"Timed out waiting for instance state change.",
1004+
&log,
1005+
)
1006+
.await
1007+
.ok();
1008+
} else {
1009+
error!(
1010+
log,
1011+
"DataStore no longer exists to mark instance failed after instance state change timeout.";
1012+
"instance_id" => %instance_id
1013+
);
1014+
}
9901015
});
9911016
}
9921017
self.write_returned_instance_state(&instance_id, state)
@@ -1043,7 +1068,8 @@ impl super::Nexus {
10431068
&state.instance().runtime_state,
10441069
error,
10451070
)
1046-
.await
1071+
.await?;
1072+
Ok(HandleInstancePutResultResult::Ok)
10471073
}
10481074
}
10491075
}
@@ -1343,7 +1369,24 @@ impl super::Nexus {
13431369
prev_instance_runtime: &db::model::InstanceRuntimeState,
13441370
reason: impl std::fmt::Debug,
13451371
) -> Result<(), Error> {
1346-
error!(self.log, "marking instance failed due to sled agent API error";
1372+
Self::mark_instance_failed_inner(
1373+
&self.db_datastore,
1374+
instance_id,
1375+
prev_instance_runtime,
1376+
reason,
1377+
&self.log,
1378+
)
1379+
.await
1380+
}
1381+
1382+
async fn mark_instance_failed_inner(
1383+
db_datastore: &Arc<db::DataStore>,
1384+
instance_id: &Uuid,
1385+
prev_instance_runtime: &db::model::InstanceRuntimeState,
1386+
reason: impl std::fmt::Debug,
1387+
log: &slog::Logger,
1388+
) -> Result<(), Error> {
1389+
error!(log, "marking instance failed due to sled agent API error";
13471390
"instance_id" => %instance_id,
13481391
"error" => ?reason);
13491392

@@ -1358,16 +1401,15 @@ impl super::Nexus {
13581401
..prev_instance_runtime.clone()
13591402
};
13601403

1361-
match self
1362-
.db_datastore
1404+
match db_datastore
13631405
.instance_update_runtime(&instance_id, &new_runtime)
13641406
.await
13651407
{
1366-
Ok(_) => info!(self.log, "marked instance as Failed";
1408+
Ok(_) => info!(log, "marked instance as Failed";
13671409
"instance_id" => %instance_id),
13681410
// XXX: It's not clear what to do with this error; should it be
13691411
// bubbled back up to the caller?
1370-
Err(e) => error!(self.log,
1412+
Err(e) => error!(log,
13711413
"failed to write Failed instance state to DB";
13721414
"instance_id" => %instance_id,
13731415
"error" => ?e),

sled-agent/src/instance_manager.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,11 @@ impl InstanceManager {
387387
{
388388
Ok(HandleInstancePutResultResult::Ok) => {}
389389
Ok(HandleInstancePutResultResult::TimedOut) => {
390-
todo!("nexus doesn't want us any more, terminate instance")
390+
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance. Rudely terminating it from our side."; "instance_id" => %instance_id);
391+
if let Err(err) = instance.terminate().await
392+
{
393+
error!(log, "Couldn't terminate instance whose creation was timed-out by Nexus"; "instance_id" => %instance_id, "err" => %err);
394+
}
391395
}
392396
Err(err) => {
393397
error!(log, "Failed to inform Nexus of instance_put success";
@@ -409,7 +413,11 @@ impl InstanceManager {
409413
{
410414
Ok(HandleInstancePutResultResult::Ok) => {}
411415
Ok(HandleInstancePutResultResult::TimedOut) => {
412-
todo!("well, i guess this is less awkward but clean up if we have to")
416+
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance, but the instance also explicitly failed on our side. Rudely terminating what remains of it."; "instance_id" => %instance_id);
417+
if let Err(err) = instance.terminate().await
418+
{
419+
error!(log, "Couldn't terminate faulted instance (whose creation was also timed-out by Nexus)"; "instance_id" => %instance_id, "err" => %err);
420+
}
413421
}
414422
Err(err) => {
415423
error!(log, "Failed to inform Nexus of instance_put failure";

0 commit comments

Comments
 (0)