Skip to content

Commit a61e036

Browse files
committed
Add VmmState::SagaUnwound
This branch is part of ongoing work on the `instance-update` saga (see PR #5749); I've factored it out into a separate PR. This is largely because this branch makes mechanical changes to a bunch of different files that aren't directly related to the core change of #5749, and I'd like to avoid the additional merge conflicts that are likely when these changes remain un-merged for a long time. --- Depends on #5854. As part of #5749, it is desirable to distinguish between *why* a VMM record was marked as `Destroyed`, in order to determine which saga is responsible for cleaning up that VMM's resources. The `instance-update` saga must be the only entity that can set an instance's VMM IDs (active and target) to NULL. Presently, however, the `instance-start` and `instance-migrate` sagas may also do this when they unwind. This is a requirement to avoid situations where a failed `instance-update` saga cannot unwind, because the instance's generation number has changed while the saga was executing. We want to ensure that if a start or migrate request fails, the instance will appear to be in the correct post-state as soon as the relevant API call returns. In order to do this, without having to also guarantee that an instance update has occurred, we introduce a new VMM state, `SagaUnwound`, with the following rules: - It is legal to start or migrate an instance if the `active_propolis_id` or `destination_propolis_id` (respectively) is either `NULL` or refers to a VMM that’s in the `SagaUnwound` state (the new VMM ID directly replaces the old ID). - If an instance has an `active_propolis_id` in the `SagaUnwound` state, then the instance appears to be `Stopped`. - If an instance has a `destination_propolis_id` in the `SagaUnwound` state, nothing special happens–the instance’s state is still derived from its active VMM’s state. - The update saga treats `SagaUnwound` VMMs as identical to `Destroyed` VMMs for purposes of deciding whether to remove a VMM ID from an instance. This branch adds a new `VmmState::SagaUnwound` variant. The `SagaUnwound` state is an internal implementation detail that shouldn't be exposed to an operator or in the external API. Sled-agents will never report that a VMM is in this state. Instead, this state is set my the `instance-start` and `instance-migrate` sagas when they unwind. When determining the API instance state from an instance and active VMM query so that the `SagaUnwound` state is mapped to `Destroyed`. Closes #5848, which this replaces.
1 parent e488d57 commit a61e036

File tree

8 files changed

+26
-14
lines changed

8 files changed

+26
-14
lines changed

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
1717
///
1818
/// This must be updated when you change the database schema. Refer to
1919
/// schema/crdb/README.adoc in the root of this repository for details.
20-
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(70, 0, 0);
20+
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(71, 0, 0);
2121

2222
/// List of all past database schema versions, in *reverse* order
2323
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
2929
// | leaving the first copy as an example for the next person.
3030
// v
3131
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
32+
KnownVersion::new(71, "add-saga-unwound-vmm-state"),
3233
KnownVersion::new(70, "separate-instance-and-vmm-states"),
3334
KnownVersion::new(69, "expose-stage0"),
3435
KnownVersion::new(68, "filter-v2p-mapping-by-instance-state"),

nexus/db-model/src/vmm_state.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ impl_enum_type!(
2424
Migrating => b"migrating"
2525
Failed => b"failed"
2626
Destroyed => b"destroyed"
27+
SagaUnwound => b"saga_unwound"
2728
);
2829

2930
impl VmmState {
@@ -37,6 +38,7 @@ impl VmmState {
3738
VmmState::Migrating => "migrating",
3839
VmmState::Failed => "failed",
3940
VmmState::Destroyed => "destroyed",
41+
VmmState::SagaUnwound => "saga_unwound",
4042
}
4143
}
4244
}
@@ -58,7 +60,7 @@ impl From<VmmState> for omicron_common::api::internal::nexus::VmmState {
5860
VmmState::Rebooting => Output::Rebooting,
5961
VmmState::Migrating => Output::Migrating,
6062
VmmState::Failed => Output::Failed,
61-
VmmState::Destroyed => Output::Destroyed,
63+
VmmState::Destroyed | VmmState::SagaUnwound => Output::Destroyed,
6264
}
6365
}
6466
}
@@ -74,7 +76,7 @@ impl From<VmmState> for sled_agent_client::types::VmmState {
7476
VmmState::Rebooting => Output::Rebooting,
7577
VmmState::Migrating => Output::Migrating,
7678
VmmState::Failed => Output::Failed,
77-
VmmState::Destroyed => Output::Destroyed,
79+
VmmState::Destroyed | VmmState::SagaUnwound => Output::Destroyed,
7880
}
7981
}
8082
}
@@ -108,7 +110,7 @@ impl From<VmmState> for omicron_common::api::external::InstanceState {
108110
VmmState::Rebooting => Output::Rebooting,
109111
VmmState::Migrating => Output::Migrating,
110112
VmmState::Failed => Output::Failed,
111-
VmmState::Destroyed => Output::Destroyed,
113+
VmmState::Destroyed | VmmState::SagaUnwound => Output::Destroyed,
112114
}
113115
}
114116
}

nexus/src/app/instance.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1664,7 +1664,7 @@ impl super::Nexus {
16641664
vmm.runtime.state,
16651665
)))
16661666
}
1667-
DbVmmState::Destroyed => Err(Error::invalid_request(
1667+
DbVmmState::Destroyed | DbVmmState::SagaUnwound => Err(Error::invalid_request(
16681668
"cannot connect to serial console of destroyed instance",
16691669
)),
16701670
}

nexus/src/app/sagas/instance_common.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -111,19 +111,22 @@ pub async fn create_and_insert_vmm_record(
111111
Ok(vmm)
112112
}
113113

114-
/// Given a previously-inserted VMM record, set its state to Destroyed and then
115-
/// delete it.
114+
/// Given a previously-inserted VMM record, set its state to `SagaUnwound` and
115+
/// then delete it.
116116
///
117117
/// This function succeeds idempotently if called with the same parameters,
118118
/// provided that the VMM record was not changed by some other actor after the
119119
/// calling saga inserted it.
120-
pub async fn destroy_vmm_record(
120+
///
121+
/// This function is intended to be called when a saga which created a VMM
122+
/// record unwinds.
123+
pub async fn unwind_vmm_record(
121124
datastore: &DataStore,
122125
opctx: &OpContext,
123126
prev_record: &db::model::Vmm,
124127
) -> Result<(), anyhow::Error> {
125128
let new_runtime = db::model::VmmRuntimeState {
126-
state: db::model::VmmState::Destroyed,
129+
state: db::model::VmmState::SagaUnwound,
127130
time_state_updated: Utc::now(),
128131
gen: prev_record.runtime.gen.next().into(),
129132
};
@@ -326,7 +329,10 @@ pub async fn instance_ip_get_instance_state(
326329
),
327330
)));
328331
}
329-
(InstanceState::Vmm, Some(VmmState::Destroyed)) => {
332+
(
333+
InstanceState::Vmm,
334+
Some(VmmState::Destroyed | VmmState::SagaUnwound),
335+
) => {
330336
return Err(ActionError::action_failed(Error::internal_error(
331337
&format!(
332338
"instance {} points to destroyed VMM",

nexus/src/app/sagas/instance_migrate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ async fn sim_destroy_vmm_record(
235235
info!(osagactx.log(), "destroying vmm record for migration unwind";
236236
"propolis_id" => %vmm.id);
237237

238-
super::instance_common::destroy_vmm_record(
238+
super::instance_common::unwind_vmm_record(
239239
osagactx.datastore(),
240240
&opctx,
241241
&vmm,

nexus/src/app/sagas/instance_start.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ async fn sis_destroy_vmm_record(
200200
);
201201

202202
let vmm = sagactx.lookup::<db::model::Vmm>("vmm_record")?;
203-
super::instance_common::destroy_vmm_record(
203+
super::instance_common::unwind_vmm_record(
204204
osagactx.datastore(),
205205
&opctx,
206206
&vmm,
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ALTER TYPE omicron.public.vmm_state
2+
ADD VALUE IF NOT EXISTS 'saga_unwound' AFTER 'destroyed';

schema/crdb/dbinit.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -982,7 +982,8 @@ CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM (
982982
'rebooting',
983983
'migrating',
984984
'failed',
985-
'destroyed'
985+
'destroyed',
986+
'saga_unwound',
986987
);
987988

988989
/*
@@ -4075,7 +4076,7 @@ INSERT INTO omicron.public.db_metadata (
40754076
version,
40764077
target_version
40774078
) VALUES
4078-
(TRUE, NOW(), NOW(), '70.0.0', NULL)
4079+
(TRUE, NOW(), NOW(), '71.0.0', NULL)
40794080
ON CONFLICT DO NOTHING;
40804081

40814082
COMMIT;

0 commit comments

Comments
 (0)