Skip to content

Commit f9a566d

Browse files
committed
[nexus] Having children shouldn't prevent reincarnation
See #8178 (comment)
1 parent c179ea6 commit f9a566d

File tree

2 files changed

+51
-32
lines changed

2 files changed

+51
-32
lines changed

nexus/src/app/sagas/instance_update/mod.rs

Lines changed: 39 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@
342342
343343
use super::{
344344
ACTION_GENERATE_ID, ActionRegistry, NexusActionContext, NexusSaga,
345-
SagaInitError,
345+
SagaContext, SagaInitError,
346346
};
347347
use crate::app::db::datastore::InstanceGestalt;
348348
use crate::app::db::datastore::VmmStateUpdateResult;
@@ -1307,37 +1307,7 @@ async fn siu_chain_successor_saga(
13071307
"instance_id" => %instance_id,
13081308
);
13091309
} else {
1310-
// If the instance has transitioned to the `Failed` state and no
1311-
// additional update saga is required, check if the instance's
1312-
// auto-restart policy allows it to be automatically restarted. If
1313-
// it does, activate the instance-reincarnation background task to
1314-
// automatically restart it.
1315-
let karmic_state = new_state
1316-
.instance
1317-
.auto_restart_status(new_state.active_vmm.as_ref());
1318-
if karmic_state.should_reincarnate() {
1319-
info!(
1320-
log,
1321-
"instance update: instance transitioned to Failed, \
1322-
but can be automatically restarted; activating \
1323-
reincarnation.";
1324-
"instance_id" => %instance_id,
1325-
"auto_restart_config" => ?new_state.instance.auto_restart,
1326-
"runtime_state" => ?new_state.instance.runtime_state,
1327-
"intended_state" => %new_state.instance.intended_state,
1328-
);
1329-
nexus.background_tasks.task_instance_reincarnation.activate();
1330-
} else {
1331-
debug!(
1332-
log,
1333-
"instance update: instance will not reincarnate";
1334-
"instance_id" => %instance_id,
1335-
"auto_restart_config" => ?new_state.instance.auto_restart,
1336-
"needs_reincarnation" => karmic_state.needs_reincarnation,
1337-
"karmic_state" => ?karmic_state.can_reincarnate,
1338-
"intended_state" => %new_state.instance.intended_state,
1339-
)
1340-
}
1310+
reincarnate_if_needed(osagactx, &new_state)
13411311
}
13421312

13431313
Ok::<(), anyhow::Error>(())
@@ -1361,6 +1331,43 @@ async fn siu_chain_successor_saga(
13611331
Ok(())
13621332
}
13631333

1334+
fn reincarnate_if_needed(osagactx: &SagaContext, state: &InstanceGestalt) {
1335+
// If the instance has transitioned to the `Failed` state and no
1336+
// additional update saga is required, check if the instance's
1337+
// auto-restart policy allows it to be automatically restarted. If
1338+
// it does, activate the instance-reincarnation background task to
1339+
// automatically restart it.
1340+
let karmic_state =
1341+
state.instance.auto_restart_status(state.active_vmm.as_ref());
1342+
if karmic_state.should_reincarnate() {
1343+
info!(
1344+
&osagactx.log(),
1345+
"instance update: instance transitioned to Failed, \
1346+
but can be automatically restarted; activating \
1347+
reincarnation.";
1348+
"instance_id" => %state.instance.id(),
1349+
"auto_restart_config" => ?state.instance.auto_restart,
1350+
"runtime_state" => ?state.instance.runtime_state,
1351+
"intended_state" => %state.instance.intended_state,
1352+
);
1353+
osagactx
1354+
.nexus()
1355+
.background_tasks
1356+
.task_instance_reincarnation
1357+
.activate();
1358+
} else {
1359+
debug!(
1360+
&osagactx.log(),
1361+
"instance update: instance will not reincarnate";
1362+
"instance_id" => %state.instance.id(),
1363+
"auto_restart_config" => ?state.instance.auto_restart,
1364+
"needs_reincarnation" => karmic_state.needs_reincarnation,
1365+
"karmic_state" => ?karmic_state.can_reincarnate,
1366+
"intended_state" => %state.instance.intended_state,
1367+
)
1368+
}
1369+
}
1370+
13641371
/// Unlock the instance record while unwinding.
13651372
///
13661373
/// This is factored out of the actual reverse action, because the `Params` type

nexus/src/app/sagas/instance_update/start.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,18 @@ async fn siu_fetch_state_and_start_real_saga(
302302
.instance_updater_unlock(&opctx, &authz_instance, &orig_lock)
303303
.await
304304
.map_err(ActionError::action_failed)?;
305+
// If we're releasing the lock, check if we should activate the
306+
// instance reincarnation background task to reincarnate this
307+
// instance. A previous activation may have not been able to
308+
// reincarnate this instance because we held the lock, so poking the
309+
// reincarnation task here should help ensure it sees a failed instance
310+
// in a timely manner.
311+
//
312+
// This doesn't matter a whole lot in production systems, where the task
313+
// will activate periodically regardless, but it should make the tests
314+
// less flakey, and hopefully also decrease the latency with which
315+
// failed instances are reincarnated a bit, maybe?
316+
super::reincarnate_if_needed(osagactx, &state);
305317
}
306318

307319
Ok(())

0 commit comments

Comments
 (0)