From f508831c21c604353bf61b81c61f5506a83191e1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 25 Mar 2024 11:17:52 -0700 Subject: [PATCH 001/234] start sketching saga --- nexus/db-queries/src/db/datastore/mod.rs | 2 +- nexus/src/app/sagas/instance_update.rs | 86 ++++++++++++++++++++++++ nexus/src/app/sagas/mod.rs | 1 + 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 nexus/src/app/sagas/instance_update.rs diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 88e1f44cea2..209ee94e121 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -111,7 +111,7 @@ mod zpool; pub use address_lot::AddressLotCreateResult; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; -pub use instance::InstanceAndActiveVmm; +pub use instance::{InstanceAndActiveVmm, InstanceSnapshot}; pub use inventory::DataStoreInventoryTest; use nexus_db_model::AllSchemaVersions; pub use rack::RackInit; diff --git a/nexus/src/app/sagas/instance_update.rs b/nexus/src/app/sagas/instance_update.rs new file mode 100644 index 00000000000..9f0f76d378d --- /dev/null +++ b/nexus/src/app/sagas/instance_update.rs @@ -0,0 +1,86 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::{ + instance_common::allocate_vmm_ipv6, NexusActionContext, NexusSaga, + SagaInitError, ACTION_GENERATE_ID, +}; +use crate::app::instance::InstanceStateChangeError; +use crate::app::sagas::declare_saga_actions; +use chrono::Utc; +use nexus_db_model::Generation; +use nexus_db_queries::db::{ + datastore::InstanceSnapshot, identity::Resource, lookup::LookupPath, +}; +use nexus_db_queries::{authn, authz, db}; +use omicron_common::api::external::{Error, InstanceState}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Parameters to the instance update saga. +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub authz_instance: authz::Instance, + + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub serialized_authn: authn::saga::Serialized, +} + +declare_saga_actions! { + instance_update; + + LOOKUP_AND_LOCK_INSTANCE -> "instance_and_vmms" { + + siu_lookup_and_lock_instance + - siu_lookup_and_lock_instance_undo + } +} + +const SAGA_INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; + +async fn siu_lookup_and_lock_instance( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let Params { ref authz_instance, ref serialized_authn, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let snapshot = + osagactx.datastore().instance_fetch_all(&opctx, authz_instance).await?; + + // try to lock + + let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; + let lock = osagactx + .datastore() + .instance_updater_try_lock( + &opctx, + &authz_instance, + &snapshot.instance.runtime_state.updater_gen, + &lock_id, + ) + .await?; + + Ok(snapshot) +} + +async fn siu_lookup_and_lock_instance_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let Params { ref authz_instance, ref serialized_authn, .. 
} = + sagactx.saga_params::()?; + let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + osagactx.datastore().instance_updater_unlock( + &opctx, + &authz_instance, + &lock_id, + )?; + Ok(()) +} diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index 17f43b4950c..af93711b6cb 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -33,6 +33,7 @@ pub mod instance_ip_attach; pub mod instance_ip_detach; pub mod instance_migrate; pub mod instance_start; +pub mod instance_update; pub mod project_create; pub mod region_replacement_drive; pub mod region_replacement_finish; From 7a8b30e11e5c66db660a6b53a98b16c68ab540b8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Apr 2024 14:22:35 -0700 Subject: [PATCH 002/234] wip --- nexus/src/app/instance.rs | 16 ++++++++++++++++ nexus/src/app/sagas/instance_update.rs | 6 ++++++ 2 files changed, 22 insertions(+) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index e6866bfab6a..472e50aa08f 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -29,6 +29,8 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; +use nexus_db_queries::db::datastore::InstanceRuntimeState; +use nexus_db_queries::db::datastore::InstanceSnapshot; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; @@ -1514,6 +1516,20 @@ impl super::Nexus { Ok(disk) } + pub(crate) async fn update_instance_state( + &self, + opctx: &OpContext, + InstanceSnapshot { instance, active_vmm, target_vmm, .. }: InstanceSnapshot, + ) -> Result { + if let Some(active_vmm) = active_vmm { + match active_vmm.runtime_state.instance_state { + InstanceState::Destroyed => {} + _ => todo!("eliza"), + } + } + todo!("eliza") + } + /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. pub(crate) async fn notify_instance_updated( diff --git a/nexus/src/app/sagas/instance_update.rs b/nexus/src/app/sagas/instance_update.rs index 9f0f76d378d..9957a640245 100644 --- a/nexus/src/app/sagas/instance_update.rs +++ b/nexus/src/app/sagas/instance_update.rs @@ -31,10 +31,16 @@ pub(crate) struct Params { declare_saga_actions! { instance_update; + // Read the target Instance from CRDB and join with its active VMM and + // migration target VMM records if they exist, and then acquire the + // "instance updater" lock with this saga's ID if no other saga is currently + // updating the instance. 
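+    //
+    // (Editorial sketch, not the actual implementation: given the
+    // `updater_id`/`updater_gen` columns used elsewhere in this series, the
+    // lock acquisition is assumed to behave roughly like the
+    // compare-and-swap
+    //
+    //     UPDATE instance
+    //        SET updater_id = <this saga's lock ID>,
+    //            updater_gen = updater_gen + 1
+    //      WHERE id = <instance ID>
+    //        AND updater_id IS NULL
+    //        AND updater_gen = <generation passed to the lock call>;
+    //
+    // so a saga that loses the race updates no rows and gets nothing back
+    // from `instance_updater_try_lock`, rather than clobbering another
+    // update saga's lock.)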
LOOKUP_AND_LOCK_INSTANCE -> "instance_and_vmms" { + siu_lookup_and_lock_instance - siu_lookup_and_lock_instance_undo } + + // } const SAGA_INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; From 2c0fb3a8a910874a0fdf7be1495efe57efba481c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 13 May 2024 09:42:05 -0700 Subject: [PATCH 003/234] wip --- .../app/sagas/instance_update/destroyed.rs | 49 +++++++++++++++++++ .../mod.rs} | 44 +++++++---------- 2 files changed, 66 insertions(+), 27 deletions(-) create mode 100644 nexus/src/app/sagas/instance_update/destroyed.rs rename nexus/src/app/sagas/{instance_update.rs => instance_update/mod.rs} (67%) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs new file mode 100644 index 00000000000..dc80c5ab442 --- /dev/null +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -0,0 +1,49 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::{siu_lock_instance, siu_unlock_instance, NexusActionContext}; +use crate::app::instance::InstanceStateChangeError; +use crate::app::sagas::declare_saga_actions; +use chrono::Utc; +use nexus_db_model::Generation; +use nexus_db_queries::db::{ + datastore::InstanceAndVmms, identity::Resource, lookup::LookupPath, +}; +use nexus_db_queries::{authn, authz, db}; +use omicron_common::api::external::{Error, InstanceState}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +declare_saga_actions! { + instance_update_destroyed; + + // Read the target Instance from CRDB and join with its active VMM and + // migration target VMM records if they exist, and then acquire the + // "instance updater" lock with this saga's ID if no other saga is currently + // updating the instance. + LOCK_INSTANCE -> "instance_and_vmms" { + + siu_lock_instance + - siu_unlock_instance + } + + DELETE_SLED_RESOURCE -> "no_result1" { + + siud_delete_sled_resource + } + + DELETE_VIRTUAL_PROVISIONING -> "no_result2" { + + siud_delete_virtual_provisioning + } +} + +async fn siud_delete_sled_resource( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + todo!() +} + +async fn siud_delete_virtual_provisioning( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + todo!() +} diff --git a/nexus/src/app/sagas/instance_update.rs b/nexus/src/app/sagas/instance_update/mod.rs similarity index 67% rename from nexus/src/app/sagas/instance_update.rs rename to nexus/src/app/sagas/instance_update/mod.rs index 9957a640245..fdd1a5162fb 100644 --- a/nexus/src/app/sagas/instance_update.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -18,6 +18,8 @@ use omicron_common::api::external::{Error, InstanceState}; use serde::{Deserialize, Serialize}; use uuid::Uuid; +pub mod destroyed; + /// Parameters to the instance update saga. #[derive(Debug, Deserialize, Serialize)] pub(crate) struct Params { @@ -26,36 +28,25 @@ pub(crate) struct Params { /// Authentication context to use to fetch the instance's current state from /// the database. pub serialized_authn: authn::saga::Serialized, -} - -declare_saga_actions! { - instance_update; - // Read the target Instance from CRDB and join with its active VMM and - // migration target VMM records if they exist, and then acquire the - // "instance updater" lock with this saga's ID if no other saga is currently - // updating the instance. 
- LOOKUP_AND_LOCK_INSTANCE -> "instance_and_vmms" { - + siu_lookup_and_lock_instance - - siu_lookup_and_lock_instance_undo - } - - // + pub start_state: InstanceSnapshot, } const SAGA_INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; -async fn siu_lookup_and_lock_instance( +async fn siu_lock_instance( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let Params { ref authz_instance, ref serialized_authn, .. } = - sagactx.saga_params::()?; + let Params { + ref authz_instance, + ref serialized_authn, + ref start_state, + .. + } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let snapshot = - osagactx.datastore().instance_fetch_all(&opctx, authz_instance).await?; // try to lock @@ -65,15 +56,15 @@ async fn siu_lookup_and_lock_instance( .instance_updater_try_lock( &opctx, &authz_instance, - &snapshot.instance.runtime_state.updater_gen, + start_state.instance.runtime_state.updater_gen, &lock_id, ) .await?; - Ok(snapshot) + Ok(()) } -async fn siu_lookup_and_lock_instance_undo( +async fn siu_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); @@ -83,10 +74,9 @@ async fn siu_lookup_and_lock_instance_undo( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - osagactx.datastore().instance_updater_unlock( - &opctx, - &authz_instance, - &lock_id, - )?; + osagactx + .datastore() + .instance_updater_unlock(&opctx, &authz_instance, &lock_id) + .await?; Ok(()) } From 3331f666fe6b2f37ecf47510dc69b08dcb628e16 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 13 May 2024 12:44:47 -0700 Subject: [PATCH 004/234] remove dead code --- nexus/src/app/instance.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 472e50aa08f..05482dbb6f7 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -29,7 +29,6 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; -use nexus_db_queries::db::datastore::InstanceRuntimeState; use nexus_db_queries::db::datastore::InstanceSnapshot; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; @@ -1516,20 +1515,6 @@ impl super::Nexus { Ok(disk) } - pub(crate) async fn update_instance_state( - &self, - opctx: &OpContext, - InstanceSnapshot { instance, active_vmm, target_vmm, .. }: InstanceSnapshot, - ) -> Result { - if let Some(active_vmm) = active_vmm { - match active_vmm.runtime_state.instance_state { - InstanceState::Destroyed => {} - _ => todo!("eliza"), - } - } - todo!("eliza") - } - /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. 
pub(crate) async fn notify_instance_updated( From 76c6960adc2a7c932d5ce067b36539b39a540789 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 13 May 2024 15:05:56 -0700 Subject: [PATCH 005/234] sketch out the whole "instance destroyed" subsaga --- .../app/sagas/instance_update/destroyed.rs | 204 ++++++++++++++++-- nexus/src/app/sagas/instance_update/mod.rs | 38 ++-- 2 files changed, 209 insertions(+), 33 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index dc80c5ab442..6f9a5f9e460 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -2,18 +2,18 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{siu_lock_instance, siu_unlock_instance, NexusActionContext}; -use crate::app::instance::InstanceStateChangeError; +use super::{ + siu_lock_instance, siu_unlock_instance, NexusActionContext, Params, +}; +use crate::app::instance_network; use crate::app::sagas::declare_saga_actions; -use chrono::Utc; +use crate::app::sagas::ActionError; use nexus_db_model::Generation; -use nexus_db_queries::db::{ - datastore::InstanceAndVmms, identity::Resource, lookup::LookupPath, -}; -use nexus_db_queries::{authn, authz, db}; -use omicron_common::api::external::{Error, InstanceState}; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; +use nexus_db_model::InstanceRuntimeState; +use nexus_db_queries::db::identity::Resource; +use omicron_common::api::external; +use omicron_common::api::external::Error; +use omicron_common::api::external::ResourceType; declare_saga_actions! { instance_update_destroyed; @@ -22,7 +22,7 @@ declare_saga_actions! { // migration target VMM records if they exist, and then acquire the // "instance updater" lock with this saga's ID if no other saga is currently // updating the instance. - LOCK_INSTANCE -> "instance_and_vmms" { + LOCK_INSTANCE -> "lock_generation" { + siu_lock_instance - siu_unlock_instance } @@ -34,16 +34,190 @@ declare_saga_actions! { DELETE_VIRTUAL_PROVISIONING -> "no_result2" { + siud_delete_virtual_provisioning } + + DELETE_V2P_MAPPINGS -> "no_result3" { + + siud_delete_v2p_mappings + } + + DELETE_NAT_ENTRIES -> "no_result4" { + + siud_delete_nat_entries + } + + UPDATE_VMM_DESTROYED -> "no_result5" { + + siud_instance_update_vmm_destroyed + } + + MARK_VMM_DELETED -> "no_result6" { + + siud_mark_vmm_deleted + } + + UNLOCK_INSTANCE -> "no_result7" { + + siu_unlock_instance + } } async fn siud_delete_sled_resource( sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - todo!() +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref active_vmm, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let propolis_id = active_vmm + .as_ref() + // TODO(eliza): don't unwrap here and put it in params instead when deciding + // what to start. + .expect("if we started this saga there is an active propolis ID") + .id; + osagactx + .datastore() + .sled_reservation_delete(&opctx, propolis_id) + .await + .or_else(|err| { + // Necessary for idempotency + match err { + Error::ObjectNotFound { .. 
} => Ok(()), + _ => Err(err), + } + }) + .map_err(ActionError::action_failed) } async fn siud_delete_virtual_provisioning( sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - todo!() +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref instance, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + osagactx + .datastore() + .virtual_provisioning_collection_delete_instance( + &opctx, + instance.id(), + instance.project_id, + i64::from(instance.ncpus.0 .0), + instance.memory, + i64::try_from(&instance.runtime_state.gen.0).unwrap(), + ) + .await + .map(|_| ()) + .or_else(|err| { + // Necessary for idempotency + match err { + Error::ObjectNotFound { .. } => Ok(()), + _ => Err(ActionError::action_failed(err)), + } + }) +} + +async fn siud_delete_v2p_mappings( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref instance, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + // Per the commentary in instance_network::delete_instance_v2p_mappings`, + // this should be idempotent. + instance_network::delete_instance_v2p_mappings( + osagactx.datastore(), + osagactx.log(), + &osagactx.nexus().opctx_alloc, + &opctx, + instance.id(), + ) + .await + .or_else(|err| { + // Necessary for idempotency + match err { + Error::ObjectNotFound { + type_name: ResourceType::Instance, + lookup_type: _, + } => Ok(()), + _ => Err(ActionError::action_failed(err)), + } + }) +} + +async fn siud_delete_nat_entries( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let opctx_alloc = &osagactx.nexus().opctx_alloc; + let resolver = osagactx.nexus().resolver().await; + let datastore = osagactx.datastore(); + let log = osagactx.log(); + + instance_network::instance_delete_dpd_config( + datastore, + log, + &resolver, + &opctx, + opctx_alloc, + authz_instance, + ) + .await + .or_else(|err| + // Necessary for idempotency + match err { + Error::ObjectNotFound { .. } => Ok(()), + _ => Err(ActionError::action_failed(err)), + }) +} + +async fn siud_instance_update_vmm_destroyed( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { instance, .. } = sagactx.saga_params::()?; + let new_runtime = InstanceRuntimeState { + propolis_id: None, + nexus_state: external::InstanceState::Stopped.into(), + gen: Generation(instance.runtime_state.gen.0.next()), + ..instance.runtime_state + }; + + // It's okay for this to fail, it just means that the active VMM ID has changed. + let _ = osagactx + .datastore() + .instance_update_runtime(&instance.id(), &new_runtime) + .await; + Ok(()) +} + +async fn siud_mark_vmm_deleted( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref active_vmm, .. } = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let propolis_id = active_vmm + .as_ref() + // TODO(eliza): don't unwrap here and put it in params instead when deciding + // what to start. 
+ .expect("if we started this saga there is an active propolis ID") + .id; + osagactx + .datastore() + .vmm_mark_deleted(&opctx, &propolis_id) + .await + .map(|_| ()) + .map_err(ActionError::action_failed) } diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index fdd1a5162fb..e8f9fa57a35 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2,19 +2,16 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{ - instance_common::allocate_vmm_ipv6, NexusActionContext, NexusSaga, - SagaInitError, ACTION_GENERATE_ID, -}; +use super::{NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID}; use crate::app::instance::InstanceStateChangeError; use crate::app::sagas::declare_saga_actions; -use chrono::Utc; use nexus_db_model::Generation; use nexus_db_queries::db::{ datastore::InstanceSnapshot, identity::Resource, lookup::LookupPath, }; +use steno::ActionError; + use nexus_db_queries::{authn, authz, db}; -use omicron_common::api::external::{Error, InstanceState}; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -30,38 +27,43 @@ pub(crate) struct Params { pub serialized_authn: authn::saga::Serialized, pub start_state: InstanceSnapshot, + pub instance: db::model::Instance, + + pub active_vmm: Option, + + pub target_vmm: Option, } const SAGA_INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; async fn siu_lock_instance( sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { +) -> Result { let osagactx = sagactx.user_data(); let Params { - ref authz_instance, - ref serialized_authn, - ref start_state, - .. + ref authz_instance, ref serialized_authn, ref instance, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - // try to lock - + // try to acquire the instance updater lock let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; - let lock = osagactx + osagactx .datastore() .instance_updater_try_lock( &opctx, &authz_instance, - start_state.instance.runtime_state.updater_gen, + instance.runtime_state.updater_gen, &lock_id, ) - .await?; - - Ok(()) + .await + .map_err(ActionError::action_failed)? + .ok_or_else(|| { + ActionError::action_failed( + serde_json::json!({"error": "can't get ye lock"}), + ) + }) } async fn siu_unlock_instance( From bc93d0e1f61571fb4a9cce320b6f1eb72e9c61b1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 14 May 2024 10:35:12 -0700 Subject: [PATCH 006/234] fix snapshot-create using renamed API --- nexus/src/app/sagas/snapshot_create.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 76a82e74912..ab5a8bcbf47 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -828,7 +828,7 @@ async fn ssc_send_snapshot_request_to_sled_agent( let sled_id = osagactx .datastore() - .instance_fetch_with_vmm(&opctx, &authz_instance) + .instance_fetch_with_active_vmm(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)? 
.sled_id(); From de11162e7c9d0175fc1a70840ff26d15b4c063ca Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 14 May 2024 10:39:03 -0700 Subject: [PATCH 007/234] oh, i guess undo actions return anyhow::Error --- nexus/src/app/sagas/instance_update/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index e8f9fa57a35..ccf69060415 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -73,6 +73,7 @@ async fn siu_unlock_instance( let Params { ref authz_instance, ref serialized_authn, .. } = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; + let gen = sagactx.lookup::(SAGA_INSTANCE_LOCK_GEN)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -82,3 +83,22 @@ async fn siu_unlock_instance( .await?; Ok(()) } + +// this is different from "lock instance" lol +async fn siu_lock_instance_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let Params { + ref authz_instance, ref serialized_authn, ref instance, .. + } = sagactx.saga_params::()?; + let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let updater_gen = instance.runtime_state.updater_gen.next().into(); + osagactx + .datastore() + .instance_updater_unlock(&opctx, &authz_instance, &lock_id, updater_gen) + .await?; + Ok(()) +} From 195d7478cce63e4526d7e86a0a223ea6f8e7eef7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 14 May 2024 11:12:27 -0700 Subject: [PATCH 008/234] okay this seems more or less right --- .../app/sagas/instance_update/destroyed.rs | 79 +++++++---- nexus/src/app/sagas/instance_update/mod.rs | 129 +++++++++++++++--- 2 files changed, 159 insertions(+), 49 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 6f9a5f9e460..21e15d357f2 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -2,31 +2,40 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::{ - siu_lock_instance, siu_unlock_instance, NexusActionContext, Params, -}; +use super::ActionRegistry; +use super::NexusActionContext; +use super::NexusSaga; use crate::app::instance_network; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::ActionError; use nexus_db_model::Generation; use nexus_db_model::InstanceRuntimeState; use nexus_db_queries::db::identity::Resource; +use nexus_db_queries::{authn, authz, db}; use omicron_common::api::external; use omicron_common::api::external::Error; use omicron_common::api::external::ResourceType; +use serde::{Deserialize, Serialize}; + +/// Parameters to the instance update VMM destroyed sub-saga. +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + pub(crate) authz_instance: authz::Instance, + + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub serialized_authn: authn::saga::Serialized, + + pub instance: db::model::Instance, + + pub vmm: db::model::Vmm, +} + +// instance update VMM destroyed subsaga: actions declare_saga_actions! 
{ instance_update_destroyed; - // Read the target Instance from CRDB and join with its active VMM and - // migration target VMM records if they exist, and then acquire the - // "instance updater" lock with this saga's ID if no other saga is currently - // updating the instance. - LOCK_INSTANCE -> "lock_generation" { - + siu_lock_instance - - siu_unlock_instance - } - DELETE_SLED_RESOURCE -> "no_result1" { + siud_delete_sled_resource } @@ -50,9 +59,30 @@ declare_saga_actions! { MARK_VMM_DELETED -> "no_result6" { + siud_mark_vmm_deleted } +} + +#[derive(Debug)] +pub(crate) struct SagaVmmDestroyed; +impl NexusSaga for SagaVmmDestroyed { + const NAME: &'static str = "instance-update-vmm-destroyed"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_destroyed_register_actions(registry); + } - UNLOCK_INSTANCE -> "no_result7" { - + siu_unlock_instance + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + builder.append(delete_sled_resource_action()); + builder.append(delete_virtual_provosioning_action()); + builder.append(delete_v2p_mappings_action()); + builder.append(delete_nat_entries_action()); + builder.append(instance_update_vmm_destroyed_action()); + builder.append(mark_vmm_deleted_action()); + + Ok(builder.build()?) } } @@ -60,20 +90,15 @@ async fn siud_delete_sled_resource( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref active_vmm, .. } = + let Params { ref serialized_authn, ref vmm, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let propolis_id = active_vmm - .as_ref() - // TODO(eliza): don't unwrap here and put it in params instead when deciding - // what to start. - .expect("if we started this saga there is an active propolis ID") - .id; + osagactx .datastore() - .sled_reservation_delete(&opctx, propolis_id) + .sled_reservation_delete(&opctx, vmm.id) .await .or_else(|err| { // Necessary for idempotency @@ -203,20 +228,14 @@ async fn siud_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref active_vmm, .. } = + let Params { ref serialized_authn, ref vmm, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let propolis_id = active_vmm - .as_ref() - // TODO(eliza): don't unwrap here and put it in params instead when deciding - // what to start. - .expect("if we started this saga there is an active propolis ID") - .id; osagactx .datastore() - .vmm_mark_deleted(&opctx, &propolis_id) + .vmm_mark_deleted(&opctx, &vmm.id) .await .map(|_| ()) .map_err(ActionError::action_failed) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index ccf69060415..6408ae556ea 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2,21 +2,20 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-use super::{NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID}; -use crate::app::instance::InstanceStateChangeError; +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, +}; +use crate::app::db::datastore::InstanceSnapshot; use crate::app::sagas::declare_saga_actions; use nexus_db_model::Generation; -use nexus_db_queries::db::{ - datastore::InstanceSnapshot, identity::Resource, lookup::LookupPath, -}; -use steno::ActionError; - -use nexus_db_queries::{authn, authz, db}; +use nexus_db_queries::{authn, authz}; +use omicron_common::api::external::InstanceState; use serde::{Deserialize, Serialize}; +use steno::{ActionError, DagBuilder, Node, SagaName}; use uuid::Uuid; -pub mod destroyed; - +mod destroyed; /// Parameters to the instance update saga. #[derive(Debug, Deserialize, Serialize)] pub(crate) struct Params { @@ -26,15 +25,107 @@ pub(crate) struct Params { /// the database. pub serialized_authn: authn::saga::Serialized, - pub start_state: InstanceSnapshot, - pub instance: db::model::Instance, + pub state: InstanceSnapshot, +} + +const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; +const INSTANCE_LOCK_GEN: &str = "saga_instance_lock_gen"; + +// instance update saga: actions + +declare_saga_actions! { + instance_update; + + // Read the target Instance from CRDB and join with its active VMM and + // migration target VMM records if they exist, and then acquire the + // "instance updater" lock with this saga's ID if no other saga is currently + // updating the instance. + LOCK_INSTANCE -> "saga_instance_lock_gen" { + + siu_lock_instance + - siu_unlock_instance + } + + UNLOCK_INSTANCE -> "no_result7" { + + siu_unlock_instance + } +} + +// instance update saga: definition + +#[derive(Debug)] +pub(crate) struct SagaInstanceUpdate; +impl NexusSaga for SagaInstanceUpdate { + const NAME: &'static str = "instance-update"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(lock_instance_action()); + + // determine which subsaga to execute based on the state of the instance + // and the VMMs associated with it. + match params.state { + // VMM destroyed subsaga + InstanceSnapshot { instance, active_vmm: Some(vmm), .. } + if vmm.runtime.state.state() == &InstanceState::Destroyed => + { + const DESTROYED_SUBSAGA_PARAMS: &str = + "params_for_vmm_destroyed_subsaga"; + let subsaga_params = destroyed::Params { + serialized_authn: params.serialized_authn.clone(), + instance: instance.clone(), + authz_instance: params.authz_instance.clone(), + vmm, + }; + let subsaga_dag = { + let subsaga_builder = DagBuilder::new(SagaName::new( + destroyed::SagaVmmDestroyed::NAME, + )); + destroyed::SagaVmmDestroyed::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? 
+ }; + + builder.append(Node::constant( + DESTROYED_SUBSAGA_PARAMS, + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + DESTROYED_SUBSAGA_PARAMS.to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "vmm_destroyed_subsaga_no_result", + subsaga_dag, + DESTROYED_SUBSAGA_PARAMS, + )); + } + _ => { + // TODO(eliza): other subsagas + } + }; - pub active_vmm: Option, + builder.append(unlock_instance_action()); - pub target_vmm: Option, + Ok(builder.build()?) + } } -const SAGA_INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; +// instance update saga: action implementations async fn siu_lock_instance( sagactx: NexusActionContext, @@ -48,7 +139,7 @@ async fn siu_lock_instance( crate::context::op_context_for_saga_action(&sagactx, serialized_authn); // try to acquire the instance updater lock - let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; osagactx .datastore() .instance_updater_try_lock( @@ -72,8 +163,8 @@ async fn siu_unlock_instance( let osagactx = sagactx.user_data(); let Params { ref authz_instance, ref serialized_authn, .. } = sagactx.saga_params::()?; - let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; - let gen = sagactx.lookup::(SAGA_INSTANCE_LOCK_GEN)?; + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let gen = sagactx.lookup::(INSTANCE_LOCK_GEN)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -92,7 +183,7 @@ async fn siu_lock_instance_undo( let Params { ref authz_instance, ref serialized_authn, ref instance, .. } = sagactx.saga_params::()?; - let lock_id = sagactx.lookup::(SAGA_INSTANCE_LOCK_ID)?; + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let updater_gen = instance.runtime_state.updater_gen.next().into(); From 440588355f9f9473f3ddca0d7d4632643cadbf41 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 14 May 2024 11:34:30 -0700 Subject: [PATCH 009/234] add logging to VMM destroyed subsaga --- .../app/sagas/instance_update/destroyed.rs | 80 ++++++++++++++++--- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 21e15d357f2..ad9b8ddd2a1 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -16,6 +16,7 @@ use omicron_common::api::external; use omicron_common::api::external::Error; use omicron_common::api::external::ResourceType; use serde::{Deserialize, Serialize}; +use slog::info; /// Parameters to the instance update VMM destroyed sub-saga. #[derive(Debug, Deserialize, Serialize)] @@ -52,8 +53,8 @@ declare_saga_actions! { + siud_delete_nat_entries } - UPDATE_VMM_DESTROYED -> "no_result5" { - + siud_instance_update_vmm_destroyed + UPDATE_INSTANCE -> "no_result5" { + + siud_update_instance } MARK_VMM_DELETED -> "no_result6" { @@ -76,10 +77,10 @@ impl NexusSaga for SagaVmmDestroyed { mut builder: steno::DagBuilder, ) -> Result { builder.append(delete_sled_resource_action()); - builder.append(delete_virtual_provosioning_action()); + builder.append(delete_virtual_provisioning_action()); builder.append(delete_v2p_mappings_action()); builder.append(delete_nat_entries_action()); - builder.append(instance_update_vmm_destroyed_action()); + builder.append(update_instance_action()); builder.append(mark_vmm_deleted_action()); Ok(builder.build()?) 
@@ -90,12 +91,20 @@ async fn siud_delete_sled_resource( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref vmm, .. } = + let Params { ref serialized_authn, ref vmm, ref instance, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + info!( + osagactx.log(), + "instance update (VMM destroyed): deleting sled reservation"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "instance_update" => %"VMM destroyed", + ); + osagactx .datastore() .sled_reservation_delete(&opctx, vmm.id) @@ -114,12 +123,20 @@ async fn siud_delete_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref instance, .. } = + let Params { ref serialized_authn, ref instance, ref vmm, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + info!( + osagactx.log(), + "instance update (VMM destroyed): deleting virtual provisioning"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "instance_update" => %"VMM destroyed", + ); + osagactx .datastore() .virtual_provisioning_collection_delete_instance( @@ -145,12 +162,20 @@ async fn siud_delete_v2p_mappings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref instance, .. } = + let Params { ref serialized_authn, ref instance, ref vmm, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + info!( + osagactx.log(), + "instance update (VMM destroyed): deleting V2P mappings"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "instance_update" => %"VMM destroyed", + ); + // Per the commentary in instance_network::delete_instance_v2p_mappings`, // this should be idempotent. instance_network::delete_instance_v2p_mappings( @@ -177,8 +202,13 @@ async fn siud_delete_nat_entries( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; + let Params { + ref serialized_authn, + ref authz_instance, + ref vmm, + ref instance, + .. + } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -187,6 +217,14 @@ async fn siud_delete_nat_entries( let datastore = osagactx.datastore(); let log = osagactx.log(); + info!( + log, + "instance update (VMM destroyed): deleting NAT entries"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "instance_update" => %"VMM destroyed", + ); + instance_network::instance_delete_dpd_config( datastore, log, @@ -204,11 +242,11 @@ async fn siud_delete_nat_entries( }) } -async fn siud_instance_update_vmm_destroyed( +async fn siud_update_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { instance, .. } = sagactx.saga_params::()?; + let Params { instance, vmm, .. 
} = sagactx.saga_params::()?; let new_runtime = InstanceRuntimeState { propolis_id: None, nexus_state: external::InstanceState::Stopped.into(), @@ -216,6 +254,15 @@ async fn siud_instance_update_vmm_destroyed( ..instance.runtime_state }; + info!( + osagactx.log(), + "instance update (VMM destroyed): updating runtime state"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "new_runtime_state" => ?new_runtime, + "instance_update" => %"VMM destroyed", + ); + // It's okay for this to fail, it just means that the active VMM ID has changed. let _ = osagactx .datastore() @@ -228,11 +275,20 @@ async fn siud_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref vmm, .. } = + let Params { ref serialized_authn, ref vmm, ref instance, .. } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + info!( + osagactx.log(), + "instance update (VMM destroyed): marking VMM record deleted"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "instance_update" => %"VMM destroyed", + ); + osagactx .datastore() .vmm_mark_deleted(&opctx, &vmm.id) From 756b78b47837dcc15de87e997b50548ab1d0a002 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 14 May 2024 11:40:03 -0700 Subject: [PATCH 010/234] fixy --- nexus/src/app/sagas/instance_update/mod.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 6408ae556ea..0cba5595855 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -131,9 +131,8 @@ async fn siu_lock_instance( sagactx: NexusActionContext, ) -> Result { let osagactx = sagactx.user_data(); - let Params { - ref authz_instance, ref serialized_authn, ref instance, .. - } = sagactx.saga_params::()?; + let Params { ref authz_instance, ref serialized_authn, ref state, .. } = + sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -145,7 +144,7 @@ async fn siu_lock_instance( .instance_updater_try_lock( &opctx, &authz_instance, - instance.runtime_state.updater_gen, + state.instance.runtime_state.updater_gen, &lock_id, ) .await @@ -180,13 +179,12 @@ async fn siu_lock_instance_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let Params { - ref authz_instance, ref serialized_authn, ref instance, .. - } = sagactx.saga_params::()?; + let Params { ref authz_instance, ref serialized_authn, ref state, .. 
} = + sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let updater_gen = instance.runtime_state.updater_gen.next().into(); + let updater_gen = state.instance.runtime_state.updater_gen.next().into(); osagactx .datastore() .instance_updater_unlock(&opctx, &authz_instance, &lock_id, updater_gen) From becb3d57045968b331fee67719eb47573d10f2c1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 14 May 2024 15:19:35 -0700 Subject: [PATCH 011/234] oh we can just call the nexus methods i guess --- .../app/sagas/instance_update/destroyed.rs | 56 +++++++------------ nexus/src/app/sagas/instance_update/mod.rs | 2 +- 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index ad9b8ddd2a1..390fe16a55b 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -178,24 +178,20 @@ async fn siud_delete_v2p_mappings( // Per the commentary in instance_network::delete_instance_v2p_mappings`, // this should be idempotent. - instance_network::delete_instance_v2p_mappings( - osagactx.datastore(), - osagactx.log(), - &osagactx.nexus().opctx_alloc, - &opctx, - instance.id(), - ) - .await - .or_else(|err| { - // Necessary for idempotency - match err { - Error::ObjectNotFound { - type_name: ResourceType::Instance, - lookup_type: _, - } => Ok(()), - _ => Err(ActionError::action_failed(err)), - } - }) + osagactx + .nexus() + .delete_instance_v2p_mappings(&opctx, instance.id()) + .await + .or_else(|err| { + // Necessary for idempotency + match err { + Error::ObjectNotFound { + type_name: ResourceType::Instance, + lookup_type: _, + } => Ok(()), + _ => Err(ActionError::action_failed(err)), + } + }) } async fn siud_delete_nat_entries( @@ -212,9 +208,6 @@ async fn siud_delete_nat_entries( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let opctx_alloc = &osagactx.nexus().opctx_alloc; - let resolver = osagactx.nexus().resolver().await; - let datastore = osagactx.datastore(); let log = osagactx.log(); info!( @@ -225,21 +218,12 @@ async fn siud_delete_nat_entries( "instance_update" => %"VMM destroyed", ); - instance_network::instance_delete_dpd_config( - datastore, - log, - &resolver, - &opctx, - opctx_alloc, - authz_instance, - ) - .await - .or_else(|err| - // Necessary for idempotency - match err { - Error::ObjectNotFound { .. } => Ok(()), - _ => Err(ActionError::action_failed(err)), - }) + osagactx + .nexus() + .instance_delete_dpd_config(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + Ok(()) } async fn siud_update_instance( diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 0cba5595855..d0c97eb63c0 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -42,7 +42,7 @@ declare_saga_actions! { // updating the instance. 
LOCK_INSTANCE -> "saga_instance_lock_gen" { + siu_lock_instance - - siu_unlock_instance + - siu_lock_instance_undo } UNLOCK_INSTANCE -> "no_result7" { From 82f4731b32a75562a46c0404ed134b9e979a1a0f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 15 May 2024 14:02:32 -0700 Subject: [PATCH 012/234] wip bgtask stuff --- nexus/db-model/src/instance_state.rs | 5 + nexus/db-queries/src/db/datastore/instance.rs | 34 +++++++ nexus/src/app/background/init.rs | 25 +++++ .../app/background/tasks/instance_updater.rs | 99 +++++++++++++++++++ nexus/src/app/background/tasks/mod.rs | 1 + 5 files changed, 164 insertions(+) create mode 100644 nexus/src/app/background/tasks/instance_updater.rs diff --git a/nexus/db-model/src/instance_state.rs b/nexus/db-model/src/instance_state.rs index 673b06e2cdf..5925e92ae0d 100644 --- a/nexus/db-model/src/instance_state.rs +++ b/nexus/db-model/src/instance_state.rs @@ -59,3 +59,8 @@ impl From for omicron_common::api::external::InstanceState { } } } + +impl diesel::query_builder::QueryId for InstanceStateEnum { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 9fb94f043e7..7443921cba3 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -295,6 +295,40 @@ impl DataStore { .collect()) } + /// List all instances with active VMMs in the `Destroyed` state that don't + /// have currently-running instance-updater sagas. + pub async fn find_instances_with_destroyed_active_vmms( + &self, + opctx: &OpContext, + ) -> ListResultVec { + use db::model::InstanceState as DbInstanceState; + use db::schema::instance::dsl; + use db::schema::vmm::dsl as vmm_dsl; + use omicron_common::api::external::InstanceState; + let destroyed = DbInstanceState::new(InstanceState::Destroyed); + Ok(vmm_dsl::vmm + .filter(vmm_dsl::time_deleted.is_not_null()) + .filter(vmm_dsl::state.eq(destroyed)) + .inner_join( + dsl::instance.on(dsl::active_propolis_id + .eq(vmm_dsl::id.nullable()) + .and(dsl::time_deleted.is_null()) + .and(dsl::updater_id.is_null())), + ) + .select((Instance::as_select(), Vmm::as_select())) + .load_async::<(Instance, Vmm)>( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? 
+ .into_iter() + .map(|(instance, vmm)| InstanceAndActiveVmm { + instance, + vmm: Some(vmm), + }) + .collect()) + } + /// Fetches information about an Instance that the caller has previously /// fetched /// diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 2f1c4cd7388..05ec3fb8549 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -98,6 +98,7 @@ use super::tasks::dns_config; use super::tasks::dns_propagation; use super::tasks::dns_servers; use super::tasks::external_endpoints; +use super::tasks::instance_updater; use super::tasks::instance_watcher; use super::tasks::inventory_collection; use super::tasks::lookup_region_port; @@ -154,6 +155,7 @@ pub struct BackgroundTasks { pub task_region_replacement: Activator, pub task_region_replacement_driver: Activator, pub task_instance_watcher: Activator, + pub task_instance_updater: Activator, pub task_service_firewall_propagation: Activator, pub task_abandoned_vmm_reaper: Activator, pub task_vpc_route_manager: Activator, @@ -234,6 +236,7 @@ impl BackgroundTasksInitializer { task_region_replacement: Activator::new(), task_region_replacement_driver: Activator::new(), task_instance_watcher: Activator::new(), + task_instance_updater: Activator::new(), task_service_firewall_propagation: Activator::new(), task_abandoned_vmm_reaper: Activator::new(), task_vpc_route_manager: Activator::new(), @@ -294,6 +297,7 @@ impl BackgroundTasksInitializer { task_region_replacement, task_region_replacement_driver, task_instance_watcher, + task_instance_updater, task_service_firewall_propagation, task_abandoned_vmm_reaper, task_vpc_route_manager, @@ -629,6 +633,27 @@ impl BackgroundTasksInitializer { }) }; + // Background task: schedule update sagas for instances in need of + // state updates. + { + let updater = { + let updater = instance_updater::InstanceUpdater::new( + datastore.clone(), + saga_request.clone(), + ); + driver.register( + "instance_updater".to_string(), + "detects if instances require update sagas and schedules them" + .to_string(), + config.instance_updater.period_secs, + Box::new(updater), + opctx.child(BTreeMap::new()), + vec![], + task_instance_updaterm + ); + }; + } + // Background task: service firewall rule propagation driver.register(TaskDefinition { name: "service_firewall_rule_propagation", diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs new file mode 100644 index 00000000000..78f3299d8f9 --- /dev/null +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -0,0 +1,99 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Background task for detecting instances in need of update sagas. +//! +//! 
TODO this is currently a placeholder for a future PR + +use super::common::BackgroundTask; +use crate::app::sagas::SagaRequest; +use futures::future::BoxFuture; +use futures::FutureExt; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use serde_json::json; +use std::sync::Arc; +use tokio::sync::mpsc::Sender; + +pub struct InstanceUpdater { + datastore: Arc, + saga_req: Sender, +} + +impl InstanceUpdater { + pub fn new( + datastore: Arc, + saga_req: Sender, + ) -> Self { + InstanceUpdater { datastore, saga_req } + } + + async fn activate2( + &mut self, + opctx: &OpContext, + ) -> Result { + let mut updated = Updated::default(); + + let log = &opctx.log; + + slog::debug!( + &log, + "looking for instances with destroyed active VMMs..." + ); + + let destroyed_active_vmms = self + .datastore + .find_instances_with_destroyed_active_vmms(opctx) + .await + .context("failed to find instances with destroyed active VMMs")?; + + slog::info!( + &log, + "listed instances with destroyed active VMMs"; + "count" => destroyed_active_vmms.len(), + ); + + updated.destroyed_active_vmms = destroyed_active_vmms.len(); + + for (instance, vmm) in destroyed_active_vmms { + let saga = SagaRequest::InstanceUpdate {}; + } + + Ok(updated) + } +} + +#[derive(Default)] +struct Updated { + destroyed_active_vmms: usize, + sagas_started: usize, +} + +impl BackgroundTask for InstanceUpdater { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + async { + match self.activate2(opctx).await { + Ok(updated) => json!({ + "destroyed_active_vmms": updated.destroyed_active_vmms, + "error": None, + }), + Err(error) => { + slog::error!( + opctx.log, + "failed to start instance update saga(s)"; + "error" => ?error, + ); + json!({ + "destroyed_active_vmms": 0, + "error": error.to_string(), + }) + } + } + } + .boxed() + } +} diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 5062799bdb4..fe041a6daad 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -14,6 +14,7 @@ pub mod dns_config; pub mod dns_propagation; pub mod dns_servers; pub mod external_endpoints; +pub mod instance_updater; pub mod instance_watcher; pub mod inventory_collection; pub mod lookup_region_port; From 88f3b8bfd51ab274a71120908872bb0f710d62a5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 16 May 2024 12:23:40 -0700 Subject: [PATCH 013/234] plumbing etc --- nexus/db-model/src/vmm_state.rs | 5 ++ nexus/db-queries/src/db/datastore/instance.rs | 10 +-- .../app/background/tasks/instance_updater.rs | 73 +++++++++++++------ nexus/src/app/instance.rs | 1 - nexus/src/app/mod.rs | 1 + .../app/sagas/instance_update/destroyed.rs | 21 +++--- nexus/src/app/sagas/instance_update/mod.rs | 44 ++++++----- nexus/src/app/sagas/mod.rs | 3 + 8 files changed, 99 insertions(+), 59 deletions(-) diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index 121daaf7dd8..058e29ba95e 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -119,3 +119,8 @@ impl From for omicron_common::api::external::InstanceState { } } } + +impl diesel::query_builder::QueryId for VmmStateEnum { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 7443921cba3..6100a628498 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ 
b/nexus/db-queries/src/db/datastore/instance.rs @@ -59,8 +59,8 @@ use uuid::Uuid; /// Wraps a record of an `Instance` along with its active `Vmm`, if it has one. #[derive(Clone, Debug)] pub struct InstanceAndActiveVmm { - instance: Instance, - vmm: Option, + pub instance: Instance, + pub vmm: Option, } impl InstanceAndActiveVmm { @@ -301,14 +301,12 @@ impl DataStore { &self, opctx: &OpContext, ) -> ListResultVec { - use db::model::InstanceState as DbInstanceState; + use db::model::VmmState; use db::schema::instance::dsl; use db::schema::vmm::dsl as vmm_dsl; - use omicron_common::api::external::InstanceState; - let destroyed = DbInstanceState::new(InstanceState::Destroyed); Ok(vmm_dsl::vmm .filter(vmm_dsl::time_deleted.is_not_null()) - .filter(vmm_dsl::state.eq(destroyed)) + .filter(vmm_dsl::state.eq(VmmState::Destroyed)) .inner_join( dsl::instance.on(dsl::active_propolis_id .eq(vmm_dsl::id.nullable()) diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 78f3299d8f9..3c14459ce65 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -7,10 +7,14 @@ //! TODO this is currently a placeholder for a future PR use super::common::BackgroundTask; -use crate::app::sagas::SagaRequest; +use crate::app::authn; +use crate::app::sagas::{self, SagaRequest}; +use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::InstanceAndActiveVmm; +use nexus_db_queries::db::datastore::InstanceSnapshot; use nexus_db_queries::db::DataStore; use serde_json::json; use std::sync::Arc; @@ -32,9 +36,8 @@ impl InstanceUpdater { async fn activate2( &mut self, opctx: &OpContext, - ) -> Result { - let mut updated = Updated::default(); - + stats: &mut ActivationStats, + ) -> Result<(), anyhow::Error> { let log = &opctx.log; slog::debug!( @@ -54,18 +57,33 @@ impl InstanceUpdater { "count" => destroyed_active_vmms.len(), ); - updated.destroyed_active_vmms = destroyed_active_vmms.len(); + stats.destroyed_active_vmms = destroyed_active_vmms.len(); - for (instance, vmm) in destroyed_active_vmms { - let saga = SagaRequest::InstanceUpdate {}; + for InstanceAndActiveVmm { instance, vmm } in destroyed_active_vmms { + let saga = SagaRequest::InstanceUpdate { + params: sagas::instance_update::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + state: InstanceSnapshot { + instance, + active_vmm: vmm, + target_vmm: None, + migration: None, // TODO(eliza) + }, + }, + }; + self.saga_req + .send(saga) + .await + .context("SagaRequest receiver missing")?; + stats.sagas_started += 1; } - Ok(updated) + Ok(()) } } #[derive(Default)] -struct Updated { +struct ActivationStats { destroyed_active_vmms: usize, sagas_started: usize, } @@ -76,23 +94,32 @@ impl BackgroundTask for InstanceUpdater { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async { - match self.activate2(opctx).await { - Ok(updated) => json!({ - "destroyed_active_vmms": updated.destroyed_active_vmms, - "error": None, - }), + let mut stats = ActivationStats::default(); + let error = match self.activate2(opctx, &mut stats).await { + Ok(()) => { + slog::info!( + &opctx.log, + "instance updater activation completed"; + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "sagas_started" => stats.sagas_started, + ); + } Err(error) => { - slog::error!( - opctx.log, - "failed to start instance update saga(s)"; - 
"error" => ?error, + slog::warn!( + &opctx.log, + "instance updater activation failed!"; + "error" => %error, + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "sagas_started" => stats.sagas_started, ); - json!({ - "destroyed_active_vmms": 0, - "error": error.to_string(), - }) + Some(error.to_string()) } - } + }; + json!({ + "destroyed_active_vmms": stats.destroyed_active_vmms, + "sagas_started": stats.sagas_started, + "error": error, + }) } .boxed() } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 05482dbb6f7..e6866bfab6a 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -29,7 +29,6 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db; use nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; -use nexus_db_queries::db::datastore::InstanceSnapshot; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; use nexus_db_queries::db::lookup::LookupPath; diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 60ed611bd7e..9508d5e7e31 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -23,6 +23,7 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_types::identity::Resource; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGD_PORT; use omicron_common::address::MGS_PORT; diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 390fe16a55b..2b4c7a34268 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -5,9 +5,9 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; -use crate::app::instance_network; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::ActionError; +use db::lookup::LookupPath; use nexus_db_model::Generation; use nexus_db_model::InstanceRuntimeState; use nexus_db_queries::db::identity::Resource; @@ -21,8 +21,6 @@ use slog::info; /// Parameters to the instance update VMM destroyed sub-saga. #[derive(Debug, Deserialize, Serialize)] pub(crate) struct Params { - pub(crate) authz_instance: authz::Instance, - /// Authentication context to use to fetch the instance's current state from /// the database. pub serialized_authn: authn::saga::Serialized, @@ -198,26 +196,25 @@ async fn siud_delete_nat_entries( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { - ref serialized_authn, - ref authz_instance, - ref vmm, - ref instance, - .. - } = sagactx.saga_params::()?; + let Params { ref serialized_authn, ref vmm, ref instance, .. 
} = + sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let log = osagactx.log(); info!( - log, + osagactx.log(), "instance update (VMM destroyed): deleting NAT entries"; "instance_id" => %instance.id(), "propolis_id" => %vmm.id, "instance_update" => %"VMM destroyed", ); + let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(instance.id()) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; osagactx .nexus() .instance_delete_dpd_config(&opctx, &authz_instance) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index d0c97eb63c0..066af7f9716 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -7,9 +7,11 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::db::datastore::InstanceSnapshot; +use crate::app::db::lookup::LookupPath; use crate::app::sagas::declare_saga_actions; use nexus_db_model::Generation; use nexus_db_queries::{authn, authz}; +use nexus_types::identity::Resource; use omicron_common::api::external::InstanceState; use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node, SagaName}; @@ -19,8 +21,6 @@ mod destroyed; /// Parameters to the instance update saga. #[derive(Debug, Deserialize, Serialize)] pub(crate) struct Params { - pub authz_instance: authz::Instance, - /// Authentication context to use to fetch the instance's current state from /// the database. pub serialized_authn: authn::saga::Serialized, @@ -77,16 +77,15 @@ impl NexusSaga for SagaInstanceUpdate { // and the VMMs associated with it. match params.state { // VMM destroyed subsaga - InstanceSnapshot { instance, active_vmm: Some(vmm), .. } - if vmm.runtime.state.state() == &InstanceState::Destroyed => - { + InstanceSnapshot { + instance, active_vmm: Some(ref vmm), .. + } if vmm.runtime.state.state() == &VmmState::Destroyed => { const DESTROYED_SUBSAGA_PARAMS: &str = "params_for_vmm_destroyed_subsaga"; let subsaga_params = destroyed::Params { serialized_authn: params.serialized_authn.clone(), instance: instance.clone(), - authz_instance: params.authz_instance.clone(), - vmm, + vmm: vmm.clone(), }; let subsaga_dag = { let subsaga_builder = DagBuilder::new(SagaName::new( @@ -131,16 +130,21 @@ async fn siu_lock_instance( sagactx: NexusActionContext, ) -> Result { let osagactx = sagactx.user_data(); - let Params { ref authz_instance, ref serialized_authn, ref state, .. } = + let Params { ref serialized_authn, ref state, .. } = sagactx.saga_params::()?; - + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let datastore = osagactx.datastore(); + + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(state.instance.id()) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; // try to acquire the instance updater lock - let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; - osagactx - .datastore() + datastore .instance_updater_try_lock( &opctx, &authz_instance, @@ -160,11 +164,10 @@ async fn siu_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let Params { ref authz_instance, ref serialized_authn, .. } = + let Params { ref serialized_authn, ref state, .. 
} = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let gen = sagactx.lookup::(INSTANCE_LOCK_GEN)?; - let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); osagactx @@ -179,14 +182,21 @@ async fn siu_lock_instance_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let Params { ref authz_instance, ref serialized_authn, ref state, .. } = + let Params { ref serialized_authn, ref state, .. } = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let datastore = osagactx.datastore(); + + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(state.instance.id()) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + let updater_gen = state.instance.runtime_state.updater_gen.next().into(); - osagactx - .datastore() + datastore .instance_updater_unlock(&opctx, &authz_instance, &lock_id, updater_gen) .await?; Ok(()) diff --git a/nexus/src/app/sagas/mod.rs b/nexus/src/app/sagas/mod.rs index af93711b6cb..0c57a5b2dc3 100644 --- a/nexus/src/app/sagas/mod.rs +++ b/nexus/src/app/sagas/mod.rs @@ -157,6 +157,9 @@ fn make_action_registry() -> ActionRegistry { ::register_actions( &mut registry, ); + ::register_actions( + &mut registry, + ); ::register_actions( &mut registry, ); From e71f8f09c2627af7a9a44a70d4258306f2b9b457 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 16 May 2024 12:37:44 -0700 Subject: [PATCH 014/234] whoops, missing none --- nexus/src/app/background/tasks/instance_updater.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 3c14459ce65..dcb63aa7e4c 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -103,6 +103,7 @@ impl BackgroundTask for InstanceUpdater { "destroyed_active_vmms" => stats.destroyed_active_vmms, "sagas_started" => stats.sagas_started, ); + None } Err(error) => { slog::warn!( From ecbdbca967106ace659f053bfc69e3c096bc06eb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 16 May 2024 13:16:51 -0700 Subject: [PATCH 015/234] more plumbing --- nexus-config/src/nexus_config.rs | 14 +++++++++ nexus/src/app/background/init.rs | 30 +++++++++---------- .../app/sagas/instance_update/destroyed.rs | 2 +- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 6e9d6b0cf02..05bf15a7751 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -379,6 +379,8 @@ pub struct BackgroundTaskConfig { pub region_replacement_driver: RegionReplacementDriverConfig, /// configuration for instance watcher task pub instance_watcher: InstanceWatcherConfig, + /// configuration for instance updater task + pub instance_updater: InstanceUpdaterConfig, /// configuration for service VPC firewall propagation task pub service_firewall_propagation: ServiceFirewallPropagationConfig, /// configuration for v2p mapping propagation task @@ -560,6 +562,14 @@ pub struct InstanceWatcherConfig { pub period_secs: Duration, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct InstanceUpdaterConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = 
"DurationSeconds")] + pub period_secs: Duration, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct ServiceFirewallPropagationConfig { @@ -995,6 +1005,9 @@ mod test { instance_watcher: InstanceWatcherConfig { period_secs: Duration::from_secs(30), }, + instance_watcher: InstanceWatcherConfig { + period_secs: Duration::from_secs(30), + }, service_firewall_propagation: ServiceFirewallPropagationConfig { period_secs: Duration::from_secs(300), @@ -1081,6 +1094,7 @@ mod test { region_replacement.period_secs = 30 region_replacement_driver.period_secs = 30 instance_watcher.period_secs = 30 + instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 05ec3fb8549..34d8d47637c 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -636,22 +636,20 @@ impl BackgroundTasksInitializer { // Background task: schedule update sagas for instances in need of // state updates. { - let updater = { - let updater = instance_updater::InstanceUpdater::new( - datastore.clone(), - saga_request.clone(), - ); - driver.register( - "instance_updater".to_string(), - "detects if instances require update sagas and schedules them" - .to_string(), - config.instance_updater.period_secs, - Box::new(updater), - opctx.child(BTreeMap::new()), - vec![], - task_instance_updaterm - ); - }; + let updater = instance_updater::InstanceUpdater::new( + datastore.clone(), + saga_request.clone(), + ); + driver.register( + "instance_updater".to_string(), + "detects if instances require update sagas and schedules them" + .to_string(), + config.instance_updater.period_secs, + Box::new(updater), + opctx.child(BTreeMap::new()), + vec![], + task_instance_updaterm, + ); } // Background task: service firewall rule propagation diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 2b4c7a34268..47b81928a2c 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -71,7 +71,7 @@ impl NexusSaga for SagaVmmDestroyed { } fn make_saga_dag( - params: &Self::Params, + _params: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { builder.append(delete_sled_resource_action()); From 4b80b2b4e8180235174e2a68769fb17f1b494412 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 16 May 2024 13:40:48 -0700 Subject: [PATCH 016/234] add configs --- smf/nexus/multi-sled/config-partial.toml | 1 + smf/nexus/single-sled/config-partial.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 396e3615b27..c502c20b1ba 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -64,6 +64,7 @@ instance_watcher.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +instance_updater.period_secs = 30 [default_region_allocation_strategy] # by default, allocate across 3 distinct sleds diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index df49476eed5..30a02431229 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -64,6 +64,7 @@ instance_watcher.period_secs = 30 
abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +instance_updater.period_secs = 30 [default_region_allocation_strategy] # by default, allocate without requirement for distinct sleds. From f05e3b30fd88157c95d6bc09dc2c562c816a2a22 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 16 May 2024 13:51:08 -0700 Subject: [PATCH 017/234] whoops --- nexus-config/src/nexus_config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 05bf15a7751..7df378efa91 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -1005,7 +1005,7 @@ mod test { instance_watcher: InstanceWatcherConfig { period_secs: Duration::from_secs(30), }, - instance_watcher: InstanceWatcherConfig { + instance_updater: InstanceUpdaterConfig { period_secs: Duration::from_secs(30), }, service_firewall_propagation: From 15770a0f6bac564fa93f0406d707808af322dd40 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 16 May 2024 14:59:36 -0700 Subject: [PATCH 018/234] remaining configs --- nexus-config/src/nexus_config.rs | 1 + nexus/examples/config.toml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 7df378efa91..49c78dae53b 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -858,6 +858,7 @@ mod test { region_replacement.period_secs = 30 region_replacement_driver.period_secs = 30 instance_watcher.period_secs = 30 + instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index b194ecf1b66..7555c86c2a7 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -118,6 +118,8 @@ region_replacement.period_secs = 30 region_replacement_driver.period_secs = 10 # How frequently to query the status of active instances. instance_watcher.period_secs = 30 +# How frequently to schedule new instance update sagass. +instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 From 41321d4f4e514d98197aa65d8ad0241368c0660a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 11:10:16 -0700 Subject: [PATCH 019/234] unassign oximeter producer --- .../app/sagas/instance_update/destroyed.rs | 62 ++++++++++++++----- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 47b81928a2c..6789a5c1571 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -32,30 +32,43 @@ pub(crate) struct Params { // instance update VMM destroyed subsaga: actions +// This subsaga is responsible for handling an instance update where the +// instance's active VMM has entered the `Destroyed` state. This requires +// deallocating resources assigned to the instance, updating the instance's +// records in the database, and marking the VMM as deleted. declare_saga_actions! { instance_update_destroyed; - DELETE_SLED_RESOURCE -> "no_result1" { - + siud_delete_sled_resource + // Deallocate physical sled resources reserved for the destroyed VMM, as it + // is no longer using them. 
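+    // (This step, like the others in this subsaga, is implemented by the
+    // `siud_`-prefixed action function of the same name defined later in
+    // this file, and declares no undo action.)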
+ RELEASE_SLED_RESOURCES -> "no_result1" { + + siud_release_sled_resources } - DELETE_VIRTUAL_PROVISIONING -> "no_result2" { - + siud_delete_virtual_provisioning + // Deallocate virtual provisioning resources reserved by the instance, as it + // is no longer running. + RELEASE_VIRTUAL_PROVISIONING -> "no_result2" { + + siud_release_virtual_provisioning } - DELETE_V2P_MAPPINGS -> "no_result3" { + // Unassign the instance's Oximeter producer. + UNASSIGN_OXIMETER_PRODUCER -> "no_result3" { + + siud_unassign_oximeter_producer + } + + DELETE_V2P_MAPPINGS -> "no_result4" { + siud_delete_v2p_mappings } - DELETE_NAT_ENTRIES -> "no_result4" { + DELETE_NAT_ENTRIES -> "no_result5" { + siud_delete_nat_entries } - UPDATE_INSTANCE -> "no_result5" { + UPDATE_INSTANCE -> "no_result6" { + siud_update_instance } - MARK_VMM_DELETED -> "no_result6" { + MARK_VMM_DELETED -> "no_result7" { + siud_mark_vmm_deleted } } @@ -74,8 +87,9 @@ impl NexusSaga for SagaVmmDestroyed { _params: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { - builder.append(delete_sled_resource_action()); - builder.append(delete_virtual_provisioning_action()); + builder.append(release_sled_resources_action()); + builder.append(release_virtual_provisioning_action()); + builder.append(unassign_oximeter_producer_action()); builder.append(delete_v2p_mappings_action()); builder.append(delete_nat_entries_action()); builder.append(update_instance_action()); @@ -85,7 +99,7 @@ impl NexusSaga for SagaVmmDestroyed { } } -async fn siud_delete_sled_resource( +async fn siud_release_sled_resources( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); @@ -97,7 +111,7 @@ async fn siud_delete_sled_resource( info!( osagactx.log(), - "instance update (VMM destroyed): deleting sled reservation"; + "instance update (VMM destroyed): deallocating sled resource reservation"; "instance_id" => %instance.id(), "propolis_id" => %vmm.id, "instance_update" => %"VMM destroyed", @@ -117,7 +131,7 @@ async fn siud_delete_sled_resource( .map_err(ActionError::action_failed) } -async fn siud_delete_virtual_provisioning( +async fn siud_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); @@ -129,7 +143,7 @@ async fn siud_delete_virtual_provisioning( info!( osagactx.log(), - "instance update (VMM destroyed): deleting virtual provisioning"; + "instance update (VMM destroyed): deallocating virtual provisioning resources"; "instance_id" => %instance.id(), "propolis_id" => %vmm.id, "instance_update" => %"VMM destroyed", @@ -156,6 +170,26 @@ async fn siud_delete_virtual_provisioning( }) } +async fn siud_unassign_oximeter_producer( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref instance, ref serialized_authn, .. 
} = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + crate::app::oximeter::unassign_producer( + osagactx.datastore(), + osagactx.log(), + &opctx, + &instance.id(), + ) + .await + .map_err(ActionError::action_failed) +} + async fn siud_delete_v2p_mappings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { From a5b6d9e1ba566a6a21ef843b5226b1856e990b57 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 11:24:35 -0700 Subject: [PATCH 020/234] update `delete_v2p_mappings` in light of #5568 --- .../app/sagas/instance_update/destroyed.rs | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 6789a5c1571..77bed8be436 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -58,6 +58,9 @@ declare_saga_actions! { DELETE_V2P_MAPPINGS -> "no_result4" { + siud_delete_v2p_mappings + // N.B. that the undo action is the same as the forward action, because + // all this does is kick the V2P manager background task. + // - siud_delete_v2p_mappings } DELETE_NAT_ENTRIES -> "no_result5" { @@ -194,12 +197,9 @@ async fn siud_delete_v2p_mappings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref instance, ref vmm, .. } = + let Params { ref instance, ref vmm, .. } = sagactx.saga_params::()?; - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - info!( osagactx.log(), "instance update (VMM destroyed): deleting V2P mappings"; @@ -208,22 +208,9 @@ async fn siud_delete_v2p_mappings( "instance_update" => %"VMM destroyed", ); - // Per the commentary in instance_network::delete_instance_v2p_mappings`, - // this should be idempotent. 
- osagactx - .nexus() - .delete_instance_v2p_mappings(&opctx, instance.id()) - .await - .or_else(|err| { - // Necessary for idempotency - match err { - Error::ObjectNotFound { - type_name: ResourceType::Instance, - lookup_type: _, - } => Ok(()), - _ => Err(ActionError::action_failed(err)), - } - }) + let nexus = osagactx.nexus(); + nexus.background_tasks.activate(&nexus.background_tasks.task_v2p_manager); + Ok(()) } async fn siud_delete_nat_entries( From 8d9cdb2101f4ee1723abda879821cbd9a6f95011 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 12:41:23 -0700 Subject: [PATCH 021/234] tear apart most of `cpapi_instances_put` --- clients/nexus-client/src/lib.rs | 2 +- clients/sled-agent-client/src/lib.rs | 1 - common/src/api/internal/nexus.rs | 3 - nexus/src/app/instance.rs | 330 ++++++++++++++------------- openapi/nexus-internal.json | 65 ------ sled-agent/src/common/instance.rs | 1 - sled-agent/src/sim/collection.rs | 15 +- sled-agent/src/sim/instance.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 1 - 9 files changed, 177 insertions(+), 243 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 162c3f4dbf8..f2a3a05fe5b 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -153,7 +153,6 @@ impl From s: omicron_common::api::internal::nexus::SledInstanceState, ) -> Self { Self { - instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), migration_state: s.migration_state.map(Into::into), @@ -200,6 +199,7 @@ impl From Input::Completed => Self::Completed, Input::Failed => Self::Failed, } + Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into() } } } diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 4e7a4a72dbf..9ba9138e181 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -162,7 +162,6 @@ impl From { fn from(s: types::SledInstanceState) -> Self { Self { - instance_state: s.instance_state.into(), propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), migration_state: s.migration_state.map(Into::into), diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index d4ed1773f68..39cde8e89a0 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -117,9 +117,6 @@ pub struct VmmRuntimeState { /// specific VMM and the instance it incarnates. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct SledInstanceState { - /// The sled's conception of the state of the instance. - pub instance_state: InstanceRuntimeState, - /// The ID of the VMM whose state is being reported. 
pub propolis_id: PropolisUuid, diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index e6866bfab6a..361cb6547d6 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1987,9 +1987,8 @@ pub(crate) async fn notify_instance_updated( ) -> Result, Error> { let propolis_id = new_runtime_state.propolis_id; - info!(log, "received new runtime state from sled agent"; + info!(log, "received new VMM runtime state from sled agent"; "instance_id" => %instance_id, - "instance_state" => ?new_runtime_state.instance_state, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state, "migration_state" => ?new_runtime_state.migration_state); @@ -2001,167 +2000,184 @@ pub(crate) async fn notify_instance_updated( .fetch() .await?; - // Update OPTE and Dendrite if the instance's active sled assignment - // changed or a migration was retired. If these actions fail, sled agent - // is expected to retry this update. - // - // This configuration must be updated before updating any state in CRDB - // so that, if the instance was migrating or has shut down, it will not - // appear to be able to migrate or start again until the appropriate - // networking state has been written. Without this interlock, another - // thread or another Nexus can race with this routine to write - // conflicting configuration. - // - // In the future, this should be replaced by a call to trigger a - // networking state update RPW. - super::instance_network::ensure_updated_instance_network_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - &authz_instance, - db_instance.runtime(), - &new_runtime_state.instance_state, - v2p_manager, - ) - .await?; - - // If the supplied instance state indicates that the instance no longer - // has an active VMM, attempt to delete the virtual provisioning record, - // and the assignment of the Propolis metric producer to an oximeter - // collector. - // - // As with updating networking state, this must be done before - // committing the new runtime state to the database: once the DB is - // written, a new start saga can arrive and start the instance, which - // will try to create its own virtual provisioning charges, which will - // race with this operation. - if new_runtime_state.instance_state.propolis_id.is_none() { - datastore - .virtual_provisioning_collection_delete_instance( - opctx, - *instance_id, - db_instance.project_id, - i64::from(db_instance.ncpus.0 .0), - db_instance.memory, - (&new_runtime_state.instance_state.gen).into(), - ) - .await?; - - // TODO-correctness: The `notify_instance_updated` method can run - // concurrently with itself in some situations, such as where a - // sled-agent attempts to update Nexus about a stopped instance; - // that times out; and it makes another request to a different - // Nexus. The call to `unassign_producer` is racy in those - // situations, and we may end with instances with no metrics. - // - // This unfortunate case should be handled as part of - // instance-lifecycle improvements, notably using a reliable - // persistent workflow to correctly update the oximete assignment as - // an instance's state changes. - // - // Tracked in https://github.com/oxidecomputer/omicron/issues/3742. 
- super::oximeter::unassign_producer( - datastore, - log, - opctx, - &instance_id.into_untyped_uuid(), - ) + let updated = datastore + .vmm_update_runtime(&propolis_id, &new_runtime_state.vmm_state) .await?; - } + + // // Update OPTE and Dendrite if the instance's active sled assignment + // // changed or a migration was retired. If these actions fail, sled agent + // // is expected to retry this update. + // // + // // This configuration must be updated before updating any state in CRDB + // // so that, if the instance was migrating or has shut down, it will not + // // appear to be able to migrate or start again until the appropriate + // // networking state has been written. Without this interlock, another + // // thread or another Nexus can race with this routine to write + // // conflicting configuration. + // // + // // In the future, this should be replaced by a call to trigger a + // // networking state update RPW. + // super::instance_network::ensure_updated_instance_network_config( + // datastore, + // log, + // resolver, + // opctx, + // opctx_alloc, + // &authz_instance, + // db_instance.runtime(), + // &new_runtime_state.instance_state, + // v2p_notification_tx.clone(), + // ) + // .await?; + + // // If the supplied instance state indicates that the instance no longer + // // has an active VMM, attempt to delete the virtual provisioning record, + // // and the assignment of the Propolis metric producer to an oximeter + // // collector. + // // + // // As with updating networking state, this must be done before + // // committing the new runtime state to the database: once the DB is + // // written, a new start saga can arrive and start the instance, which + // // will try to create its own virtual provisioning charges, which will + // // race with this operation. + // if new_runtime_state.instance_state.propolis_id.is_none() { + // datastore + // .virtual_provisioning_collection_delete_instance( + // opctx, + // *instance_id, + // db_instance.project_id, + // i64::from(db_instance.ncpus.0 .0), + // db_instance.memory, + // (&new_runtime_state.instance_state.gen).into(), + // ) + // .await?; // Write the new instance and VMM states back to CRDB. This needs to be // done before trying to clean up the VMM, since the datastore will only // allow a VMM to be marked as deleted if it is already in a terminal // state. - let result = datastore - .instance_and_vmm_update_runtime( - instance_id, - &db::model::InstanceRuntimeState::from( - new_runtime_state.instance_state.clone(), - ), - &propolis_id, - &db::model::VmmRuntimeState::from( - new_runtime_state.vmm_state.clone(), - ), - &new_runtime_state.migration_state, - ) - .await; - - // If the VMM is now in a terminal state, make sure its resources get - // cleaned up. - // - // For idempotency, only check to see if the update was successfully - // processed and ignore whether the VMM record was actually updated. - // This is required to handle the case where this routine is called - // once, writes the terminal VMM state, fails before all per-VMM - // resources are released, returns a retriable error, and is retried: - // the per-VMM resources still need to be cleaned up, but the DB update - // will return Ok(_, false) because the database was already updated. - // - // Unlike the pre-update cases, it is legal to do this cleanup *after* - // committing state to the database, because a terminated VMM cannot be - // reused (restarting or migrating its former instance will use new VMM - // IDs). 
- if result.is_ok() { - let propolis_terminated = matches!( - new_runtime_state.vmm_state.state, - VmmState::Destroyed | VmmState::Failed - ); - - if propolis_terminated { - info!(log, "vmm is terminated, cleaning up resources"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - - datastore - .sled_reservation_delete(opctx, propolis_id.into_untyped_uuid()) - .await?; - - if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? { - warn!(log, "failed to mark vmm record as deleted"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); - } - } - } - - match result { - Ok(result) => { - info!(log, "instance and vmm updated by sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "instance_updated" => result.instance_updated, - "vmm_updated" => result.vmm_updated, - "migration_updated" => ?result.migration_updated); - Ok(Some(result)) - } - - // The update command should swallow object-not-found errors and - // return them back as failures to update, so this error case is - // unexpected. There's no work to do if this occurs, however. - Err(Error::ObjectNotFound { .. }) => { - error!(log, "instance/vmm update unexpectedly returned \ - an object not found error"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id); - Ok(None) - } - - // If the datastore is unavailable, propagate that to the caller. - // TODO-robustness Really this should be any _transient_ error. How - // can we distinguish? Maybe datastore should emit something - // different from Error with an Into. - Err(error) => { - warn!(log, "failed to update instance from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "error" => ?error); - Err(error) - } - } + // let result = datastore + // .instance_and_vmm_update_runtime( + // instance_id, + // &db::model::InstanceRuntimeState::from( + // new_runtime_state.instance_state.clone(), + // ), + // &propolis_id, + // &db::model::VmmRuntimeState::from( + // new_runtime_state.vmm_state.clone(), + // ), + // &new_runtime_state.migration_state, + // ) + // .await; + + // // Has a migration terminated? If so,mark the migration record as deleted if + // // and only if both sides of the migration are in a terminal state. + // if let Some(nexus::MigrationRuntimeState { + // migration_id, + // state, + // role, + // .. + // }) = new_runtime_state.migration_state + // { + // if state.is_terminal() { + // info!( + // log, + // "migration has terminated, trying to delete it..."; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "migration_id" => %propolis_id, + // "migration_state" => %state, + // "migration_role" => %role, + // ); + // if !datastore.migration_terminate(opctx, migration_id).await? { + // info!( + // log, + // "did not mark migration record as deleted (the other half \ + // may not yet have reported termination)"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "migration_id" => %propolis_id, + // "migration_state" => %state, + // "migration_role" => %role, + // ); + // } + // } + // } + + // // If the VMM is now in a terminal state, make sure its resources get + // // cleaned up. + // // + // // For idempotency, only check to see if the update was successfully + // // processed and ignore whether the VMM record was actually updated. 
+ // // This is required to handle the case where this routine is called + // // once, writes the terminal VMM state, fails before all per-VMM + // // resources are released, returns a retriable error, and is retried: + // // the per-VMM resources still need to be cleaned up, but the DB update + // // will return Ok(_, false) because the database was already updated. + // // + // // Unlike the pre-update cases, it is legal to do this cleanup *after* + // // committing state to the database, because a terminated VMM cannot be + // // reused (restarting or migrating its former instance will use new VMM + // // IDs). + // if result.is_ok() { + // let propolis_terminated = matches!( + // new_runtime_state.vmm_state.state, + // VmmState::Destroyed | VmmState::Failed + // ); + + // if propolis_terminated { + // info!(log, "vmm is terminated, cleaning up resources"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id); + + // datastore + // .sled_reservation_delete(opctx, propolis_id.into_untyped_uuid()) + // .await?; + + // if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? { + // warn!(log, "failed to mark vmm record as deleted"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "vmm_state" => ?new_runtime_state.vmm_state); + // } + // } + // } + + // match result { + // Ok((instance_updated, vmm_updated)) => { + // info!(log, "instance and vmm updated by sled agent"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "instance_updated" => instance_updated, + // "vmm_updated" => vmm_updated); + // Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) + // } + + // // The update command should swallow object-not-found errors and + // // return them back as failures to update, so this error case is + // // unexpected. There's no work to do if this occurs, however. + // Err(Error::ObjectNotFound { .. }) => { + // error!(log, "instance/vmm update unexpectedly returned \ + // an object not found error"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id); + // Ok(None) + // } + + // // If the datastore is unavailable, propagate that to the caller. + // // TODO-robustness Really this should be any _transient_ error. How + // // can we distinguish? Maybe datastore should emit something + // // different from Error with an Into. 
+ // Err(error) => { + // warn!(log, "failed to update instance from sled agent"; + // "instance_id" => %instance_id, + // "propolis_id" => %propolis_id, + // "error" => ?error); + // Err(error) + // } + + // } + Ok(Some(InstanceUpdated { vmm_updated: updated, instance_updated: false })) } /// Determines the disposition of a request to start an instance given its state diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 912ccbcf00b..b3f2a3dd9e4 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3183,53 +3183,6 @@ } ] }, - "InstanceRuntimeState": { - "description": "The dynamic runtime properties of an instance: its current VMM ID (if any), migration information (if any), and the instance state to report if there is no active VMM.", - "type": "object", - "properties": { - "dst_propolis_id": { - "nullable": true, - "description": "If a migration is active, the ID of the target VMM.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "gen": { - "description": "Generation number for this state.", - "allOf": [ - { - "$ref": "#/components/schemas/Generation" - } - ] - }, - "migration_id": { - "nullable": true, - "description": "If a migration is active, the ID of that migration.", - "type": "string", - "format": "uuid" - }, - "propolis_id": { - "nullable": true, - "description": "The instance's currently active VMM ID.", - "allOf": [ - { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - } - ] - }, - "time_updated": { - "description": "Timestamp for this information.", - "type": "string", - "format": "date-time" - } - }, - "required": [ - "gen", - "time_updated" - ] - }, "IpNet": { "x-rust-type": { "crate": "oxnet", @@ -4716,23 +4669,6 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "instance_state": { - "description": "The sled's conception of the state of the instance.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceRuntimeState" - } - ] - }, - "migration_state": { - "nullable": true, - "description": "The current state of any in-progress migration for this instance, as understood by this sled.", - "allOf": [ - { - "$ref": "#/components/schemas/MigrationRuntimeState" - } - ] - }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "allOf": [ @@ -4751,7 +4687,6 @@ } }, "required": [ - "instance_state", "propolis_id", "vmm_state" ] diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 0fe2e276982..95eda52cdbe 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -257,7 +257,6 @@ impl InstanceStates { /// use the `instance` or `vmm` accessors instead. 
pub fn sled_instance_state(&self) -> SledInstanceState { SledInstanceState { - instance_state: self.instance.clone(), vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, migration_state: self.migration.clone(), diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 8af71ac026e..d4e2c365352 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -447,12 +447,8 @@ mod test { time_updated: Utc::now(), }; - let state = SledInstanceState { - instance_state: instance_vmm, - vmm_state, - propolis_id, - migration_state: None, - }; + let state = + SledInstanceState { vmm_state, propolis_id, migration_state: None }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) } @@ -501,14 +497,8 @@ mod test { assert!(dropped.is_none()); assert!(instance.object.desired().is_none()); let rnext = instance.object.current(); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); assert!(rnext.vmm_state.gen > rprev.vmm_state.gen); - assert!( - rnext.instance_state.time_updated - >= rprev.instance_state.time_updated - ); assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); - assert!(rnext.instance_state.propolis_id.is_none()); assert_eq!(rnext.vmm_state.state, VmmState::Destroyed); assert!(rx.try_next().is_err()); @@ -632,7 +622,6 @@ mod test { assert!(rnext.vmm_state.time_updated >= rprev.vmm_state.time_updated); assert_eq!(rprev.vmm_state.state, VmmState::Stopping); assert_eq!(rnext.vmm_state.state, VmmState::Destroyed); - assert!(rnext.instance_state.gen > rprev.instance_state.gen); logctx.cleanup_successful(); } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index e94b3b4984d..05315f3f333 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -466,7 +466,7 @@ impl Simulatable for SimInstance { SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( - current.instance_state, + todo!(), current.vmm_state, current.propolis_id, ), diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 9acfa24b3dc..84c9f8b8328 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -373,7 +373,6 @@ impl SledAgent { .sim_ensure( &instance_id.into_untyped_uuid(), SledInstanceState { - instance_state: instance_runtime, vmm_state: vmm_runtime, propolis_id, migration_state: None, From 685094694bc600661013c2729885dad01c851ac5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 13:37:31 -0700 Subject: [PATCH 022/234] rewrite most of the saga --- .../app/background/tasks/instance_updater.rs | 12 +- nexus/src/app/instance.rs | 16 +- .../app/sagas/instance_update/destroyed.rs | 102 +++++--- nexus/src/app/sagas/instance_update/mod.rs | 223 +++++++++++------- 4 files changed, 219 insertions(+), 134 deletions(-) diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index dcb63aa7e4c..b7a6fb40ab2 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -60,17 +60,7 @@ impl InstanceUpdater { stats.destroyed_active_vmms = destroyed_active_vmms.len(); for InstanceAndActiveVmm { instance, vmm } in destroyed_active_vmms { - let saga = SagaRequest::InstanceUpdate { - params: sagas::instance_update::Params { - serialized_authn: authn::saga::Serialized::for_opctx(opctx), - state: InstanceSnapshot { - instance, - active_vmm: vmm, - target_vmm: None, - 
migration: None, // TODO(eliza) - }, - }, - }; + let saga = SagaRequest::InstanceUpdate { params: todo!() }; self.saga_req .send(saga) .await diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 361cb6547d6..00ab1082428 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1329,12 +1329,11 @@ impl super::Nexus { if let Some(state) = state { let update_result = self .db_datastore - .instance_and_vmm_update_runtime( - instance_id, - &state.instance_state.into(), + .vmm_update_runtime( &state.propolis_id, &state.vmm_state.into(), - &state.migration_state, + // TODO(eliza): re-enable writing back migrations! + // &state.migration_state, ) .await; @@ -1344,7 +1343,8 @@ impl super::Nexus { "propolis_id" => %state.propolis_id, "result" => ?update_result); - update_result + // TODO(eliza): probably just change the retval to `bool` later... + update_result.map(|vmm_updated| (false, vmm_updated)) } else { // There was no instance state to write back, so --- perhaps // obviously --- nothing happened. @@ -2001,7 +2001,11 @@ pub(crate) async fn notify_instance_updated( .await?; let updated = datastore - .vmm_update_runtime(&propolis_id, &new_runtime_state.vmm_state) + .vmm_update_runtime( + &propolis_id, + // TODO(eliza): probably should take this by value... + &new_runtime_state.vmm_state.clone().into(), + ) .await?; // // Update OPTE and Dendrite if the instance's active sled assignment diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 77bed8be436..b4cb12c57af 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -5,31 +5,21 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; +use super::Params; +use super::STATE; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::ActionError; -use db::lookup::LookupPath; use nexus_db_model::Generation; +use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; +use nexus_db_model::Vmm; +use nexus_db_queries::db::datastore::InstanceAndVmms; use nexus_db_queries::db::identity::Resource; -use nexus_db_queries::{authn, authz, db}; use omicron_common::api::external; use omicron_common::api::external::Error; -use omicron_common::api::external::ResourceType; -use serde::{Deserialize, Serialize}; +use omicron_common::api::external::InstanceState; use slog::info; -/// Parameters to the instance update VMM destroyed sub-saga. -#[derive(Debug, Deserialize, Serialize)] -pub(crate) struct Params { - /// Authentication context to use to fetch the instance's current state from - /// the database. - pub serialized_authn: authn::saga::Serialized, - - pub instance: db::model::Instance, - - pub vmm: db::model::Vmm, -} - // instance update VMM destroyed subsaga: actions // This subsaga is responsible for handling an instance update where the @@ -102,11 +92,29 @@ impl NexusSaga for SagaVmmDestroyed { } } +fn get_destroyed_vmm( + sagactx: &NexusActionContext, +) -> Result, ActionError> { + let state = sagactx.lookup::(STATE)?; + match state.active_vmm { + Some(vmm) if vmm.runtime.state.state() == &InstanceState::Destroyed => { + Ok(Some((state.instance, vmm))) + } + _ => Ok(None), + } +} + async fn siud_release_sled_resources( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((_, vmm)) = get_destroyed_vmm(&sagactx)? 
else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. + return Ok(()); + }; + let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref vmm, ref instance, .. } = + let Params { ref serialized_authn, ref authz_instance } = sagactx.saga_params::()?; let opctx = @@ -114,8 +122,8 @@ async fn siud_release_sled_resources( info!( osagactx.log(), - "instance update (VMM destroyed): deallocating sled resource reservation"; - "instance_id" => %instance.id(), + "instance update (active VMM destroyed): deallocating sled resource reservation"; + "instance_id" => %authz_instance.id(), "propolis_id" => %vmm.id, "instance_update" => %"VMM destroyed", ); @@ -137,8 +145,14 @@ async fn siud_release_sled_resources( async fn siud_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. + return Ok(()); + }; + let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref instance, ref vmm, .. } = + let Params { ref serialized_authn, ref authz_instance } = sagactx.saga_params::()?; let opctx = @@ -147,7 +161,7 @@ async fn siud_release_virtual_provisioning( info!( osagactx.log(), "instance update (VMM destroyed): deallocating virtual provisioning resources"; - "instance_id" => %instance.id(), + "instance_id" => %authz_instance.id(), "propolis_id" => %vmm.id, "instance_update" => %"VMM destroyed", ); @@ -156,7 +170,7 @@ async fn siud_release_virtual_provisioning( .datastore() .virtual_provisioning_collection_delete_instance( &opctx, - instance.id(), + authz_instance.id(), instance.project_id, i64::from(instance.ncpus.0 .0), instance.memory, @@ -176,8 +190,13 @@ async fn siud_release_virtual_provisioning( async fn siud_unassign_oximeter_producer( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((_, vmm)) = get_destroyed_vmm(&sagactx)? else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. + return Ok(()); + }; let osagactx = sagactx.user_data(); - let Params { ref instance, ref serialized_authn, .. } = + let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; let opctx = @@ -187,7 +206,7 @@ async fn siud_unassign_oximeter_producer( osagactx.datastore(), osagactx.log(), &opctx, - &instance.id(), + &authz_instance.id(), ) .await .map_err(ActionError::action_failed) @@ -196,10 +215,12 @@ async fn siud_unassign_oximeter_producer( async fn siud_delete_v2p_mappings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. + return Ok(()); + }; let osagactx = sagactx.user_data(); - let Params { ref instance, ref vmm, .. } = - sagactx.saga_params::()?; - info!( osagactx.log(), "instance update (VMM destroyed): deleting V2P mappings"; @@ -216,8 +237,13 @@ async fn siud_delete_v2p_mappings( async fn siud_delete_nat_entries( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((_, vmm)) = get_destroyed_vmm(&sagactx)? else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. 
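+        // (Every action in this subsaga starts with this same guard, which
+        // lets the parent update saga append the subsaga unconditionally.)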
+ return Ok(()); + }; let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref vmm, ref instance, .. } = + let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; let opctx = @@ -226,16 +252,11 @@ async fn siud_delete_nat_entries( info!( osagactx.log(), "instance update (VMM destroyed): deleting NAT entries"; - "instance_id" => %instance.id(), + "instance_id" => %authz_instance.id(), "propolis_id" => %vmm.id, "instance_update" => %"VMM destroyed", ); - let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; osagactx .nexus() .instance_delete_dpd_config(&opctx, &authz_instance) @@ -247,8 +268,12 @@ async fn siud_delete_nat_entries( async fn siud_update_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. + return Ok(()); + }; let osagactx = sagactx.user_data(); - let Params { instance, vmm, .. } = sagactx.saga_params::()?; let new_runtime = InstanceRuntimeState { propolis_id: None, nexus_state: external::InstanceState::Stopped.into(), @@ -276,8 +301,13 @@ async fn siud_update_instance( async fn siud_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { + // if the update we are handling is not an active VMM destroyed update, + // bail --- there's nothing to do here. + return Ok(()); + }; let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref vmm, ref instance, .. } = + let Params { ref serialized_authn, .. } = sagactx.saga_params::()?; let opctx = diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 066af7f9716..5a2940f9d98 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -25,26 +25,32 @@ pub(crate) struct Params { /// the database. pub serialized_authn: authn::saga::Serialized, - pub state: InstanceSnapshot, + pub authz_instance: authz::Instance, } const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; -const INSTANCE_LOCK_GEN: &str = "saga_instance_lock_gen"; +const STATE: &str = "state"; // instance update saga: actions declare_saga_actions! { instance_update; - // Read the target Instance from CRDB and join with its active VMM and - // migration target VMM records if they exist, and then acquire the - // "instance updater" lock with this saga's ID if no other saga is currently - // updating the instance. + // Acquire the instance updater" lock with this saga's ID if no other saga + // is currently updating the instance. LOCK_INSTANCE -> "saga_instance_lock_gen" { + siu_lock_instance - siu_lock_instance_undo } + // Fetch the instance and VMM's state. + // N.B. that this must be performed as a separate action from + // `LOCK_INSTANCE`, so that if the lookup fails, we will still unwind the + // `LOCK_INSTANCE` action and release the lock. 
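+    // The fetched snapshot is emitted as this action's "state" output node,
+    // where later actions and subsagas can retrieve it.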
+ FETCH_STATE -> "state" { + + siu_fetch_state + } + UNLOCK_INSTANCE -> "no_result7" { + siu_unlock_instance } @@ -72,52 +78,38 @@ impl NexusSaga for SagaInstanceUpdate { ACTION_GENERATE_ID.as_ref(), )); builder.append(lock_instance_action()); + builder.append(fetch_state_action()); // determine which subsaga to execute based on the state of the instance // and the VMMs associated with it. - match params.state { - // VMM destroyed subsaga - InstanceSnapshot { - instance, active_vmm: Some(ref vmm), .. - } if vmm.runtime.state.state() == &VmmState::Destroyed => { - const DESTROYED_SUBSAGA_PARAMS: &str = - "params_for_vmm_destroyed_subsaga"; - let subsaga_params = destroyed::Params { - serialized_authn: params.serialized_authn.clone(), - instance: instance.clone(), - vmm: vmm.clone(), - }; - let subsaga_dag = { - let subsaga_builder = DagBuilder::new(SagaName::new( - destroyed::SagaVmmDestroyed::NAME, - )); - destroyed::SagaVmmDestroyed::make_saga_dag( - &subsaga_params, - subsaga_builder, - )? - }; - - builder.append(Node::constant( - DESTROYED_SUBSAGA_PARAMS, - serde_json::to_value(&subsaga_params).map_err(|e| { - SagaInitError::SerializeError( - DESTROYED_SUBSAGA_PARAMS.to_string(), - e, - ) - })?, - )); - - builder.append(Node::subsaga( - "vmm_destroyed_subsaga_no_result", - subsaga_dag, - DESTROYED_SUBSAGA_PARAMS, - )); - } - _ => { - // TODO(eliza): other subsagas - } + const DESTROYED_SUBSAGA_PARAMS: &str = + "params_for_vmm_destroyed_subsaga"; + let subsaga_dag = { + let subsaga_builder = DagBuilder::new(SagaName::new( + destroyed::SagaVmmDestroyed::NAME, + )); + destroyed::SagaVmmDestroyed::make_saga_dag( + ¶ms, + subsaga_builder, + )? }; + builder.append(Node::constant( + DESTROYED_SUBSAGA_PARAMS, + serde_json::to_value(¶ms).map_err(|e| { + SagaInitError::SerializeError( + DESTROYED_SUBSAGA_PARAMS.to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "vmm_destroyed_subsaga_no_result", + subsaga_dag, + DESTROYED_SUBSAGA_PARAMS, + )); + builder.append(unlock_instance_action()); Ok(builder.build()?) @@ -130,44 +122,113 @@ async fn siu_lock_instance( sagactx: NexusActionContext, ) -> Result { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref state, .. } = + let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let datastore = osagactx.datastore(); + let log = osagactx.log(); + let instance_id = authz_instance.id(); + slog::info!( + log, + "instance update: attempting to lock instance"; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + ); - let (.., authz_instance) = LookupPath::new(&opctx, datastore) - .instance_id(state.instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; - - // try to acquire the instance updater lock - datastore - .instance_updater_try_lock( - &opctx, - &authz_instance, - state.instance.runtime_state.updater_gen, - &lock_id, - ) + loop { + let instance = datastore + .instance_refetch(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + // Look at the current lock state of the instance and determine whether + // we can lock it. 
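+        // There are three cases: the lock is already held by this saga (we
+        // may proceed), it is held by a different saga (we must fail), or it
+        // is not currently held (we attempt to acquire it below using the
+        // instance's current updater generation).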
+ match instance.runtime_state.updater_id { + Some(ref id) if id == &lock_id => { + slog::info!( + log, + "instance update: instance already locked by this saga"; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + ); + return Ok(instance.runtime_state.updater_gen); + } + Some(ref id) => { + slog::info!( + log, + "instance update: instance locked by another saga"; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + "locked_by" => %lock_id, + ); + return Err(ActionError::action_failed(serde_json::json!({ + "error": "instance locked by another saga", + "saga_id": lock_id, + "locked_by": id, + }))); + } + None => {} + }; + let gen = instance.runtime_state.updater_gen; + slog::debug!( + log, + "instance update: trying to acquire updater lock..."; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + "updater_gen" => ?gen, + ); + let lock = datastore + .instance_updater_try_lock(&opctx, &authz_instance, gen, &lock_id) + .await + .map_err(ActionError::action_failed)?; + match lock { + Some(lock_gen) => { + slog::info!( + log, + "instance update: acquired updater lock"; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + "updater_gen" => ?gen, + ); + return Ok(lock_gen); + } + None => { + slog::debug!( + log, + "instance update: generation has advanced, retrying..."; + "instance_id" => %instance_id, + "saga_id" => %lock_id, + "updater_gen" => ?gen, + ); + } + } + } +} + +async fn siu_fetch_state( + sagactx: NexusActionContext, +) -> Result { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + osagactx + .datastore() + .instance_fetch_with_vmms(&opctx, authz_instance) .await - .map_err(ActionError::action_failed)? - .ok_or_else(|| { - ActionError::action_failed( - serde_json::json!({"error": "can't get ye lock"}), - ) - }) + .map_err(ActionError::action_failed) } async fn siu_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref state, .. } = + let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; - let gen = sagactx.lookup::(INSTANCE_LOCK_GEN)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); osagactx @@ -177,27 +238,27 @@ async fn siu_unlock_instance( Ok(()) } -// this is different from "lock instance" lol +// N.B. that this has to be a separate function just because the undo action +// must return `anyhow::Error` rather than `ActionError`. async fn siu_lock_instance_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref state, .. } = + let Params { ref serialized_authn, ref authz_instance, .. 
} = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let datastore = osagactx.datastore(); - let (.., authz_instance) = LookupPath::new(&opctx, datastore) - .instance_id(state.instance.id()) - .lookup_for(authz::Action::Modify) - .await - .map_err(ActionError::action_failed)?; + slog::info!( + osagactx.log(), + "instance update: unlocking instance on unwind"; + "instance_id" => %authz_instance.id(), + "saga_id" => %lock_id, + ); + + datastore.instance_updater_unlock(&opctx, authz_instance, &lock_id).await?; - let updater_gen = state.instance.runtime_state.updater_gen.next().into(); - datastore - .instance_updater_unlock(&opctx, &authz_instance, &lock_id, updater_gen) - .await?; Ok(()) } From dc807a963739e9c7a85e7bd4084d296f11f490bd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 13:50:51 -0700 Subject: [PATCH 023/234] fixup --- clients/nexus-client/src/lib.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index f2a3a05fe5b..bb44faba8d1 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -122,22 +122,6 @@ impl From for omicron_common::api::internal::nexus::VmmState { } } -impl From - for types::InstanceRuntimeState -{ - fn from( - s: omicron_common::api::internal::nexus::InstanceRuntimeState, - ) -> Self { - Self { - dst_propolis_id: s.dst_propolis_id, - gen: s.gen, - migration_id: s.migration_id, - propolis_id: s.propolis_id, - time_updated: s.time_updated, - } - } -} - impl From for types::VmmRuntimeState { From 3a421eda3e1afc8f2d84eb92c1528bb9c5794a8e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 15:39:06 -0700 Subject: [PATCH 024/234] rm unneeded comment --- nexus/src/app/sagas/instance_update/destroyed.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index b4cb12c57af..0ed87c74f12 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -48,9 +48,6 @@ declare_saga_actions! { DELETE_V2P_MAPPINGS -> "no_result4" { + siud_delete_v2p_mappings - // N.B. that the undo action is the same as the forward action, because - // all this does is kick the V2P manager background task. - // - siud_delete_v2p_mappings } DELETE_NAT_ENTRIES -> "no_result5" { From d8c0e63b12b60cb9aab13f561c4b15eb27906e12 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 24 May 2024 17:36:57 -0700 Subject: [PATCH 025/234] WHEW OKAY --- dev-tools/omdb/src/bin/omdb/nexus.rs | 6 +- nexus/src/app/background/init.rs | 3 +- .../app/background/tasks/instance_updater.rs | 22 +++- .../app/background/tasks/instance_watcher.rs | 46 ++++--- nexus/src/app/instance.rs | 123 +++++++++--------- nexus/src/app/mod.rs | 1 - .../app/sagas/instance_update/destroyed.rs | 2 +- 7 files changed, 111 insertions(+), 92 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 8649d15aa64..e19c998c3d3 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -929,6 +929,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { /// number of stale instance metrics that were deleted pruned_instances: usize, + /// update sagas queued due to instance updates. 
+ update_sagas_queued: usize, + /// instance states from completed checks. /// /// this is a mapping of stringified instance states to the number @@ -970,6 +973,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { ), Ok(TaskSuccess { total_instances, + update_sagas_queued, pruned_instances, instance_states, failed_checks, @@ -987,7 +991,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { for (state, count) in &instance_states { println!(" -> {count} instances {state}") } - + println!(" update sagas queued: {update_sagas_queued}"); println!(" failed checks: {total_failures}"); for (failure, count) in &failed_checks { println!(" -> {count} {failure}") diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 34d8d47637c..e808b37557a 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -617,10 +617,9 @@ impl BackgroundTasksInitializer { { let watcher = instance_watcher::InstanceWatcher::new( datastore.clone(), - resolver.clone(), producer_registry, instance_watcher::WatcherIdentity { nexus_id, rack_id }, - task_v2p_manager.clone(), + saga_request.clone(), ); driver.register(TaskDefinition { name: "instance_watcher", diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index b7a6fb40ab2..4db099645cb 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -7,15 +7,17 @@ //! TODO this is currently a placeholder for a future PR use super::common::BackgroundTask; -use crate::app::authn; -use crate::app::sagas::{self, SagaRequest}; +use crate::app::sagas::instance_update; +use crate::app::sagas::SagaRequest; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; -use nexus_db_queries::db::datastore::InstanceSnapshot; +use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; +use nexus_db_queries::{authn, authz}; +use nexus_types::identity::Resource; use serde_json::json; use std::sync::Arc; use tokio::sync::mpsc::Sender; @@ -59,8 +61,18 @@ impl InstanceUpdater { stats.destroyed_active_vmms = destroyed_active_vmms.len(); - for InstanceAndActiveVmm { instance, vmm } in destroyed_active_vmms { - let saga = SagaRequest::InstanceUpdate { params: todo!() }; + for InstanceAndActiveVmm { instance, .. 
} in destroyed_active_vmms { + let serialized_authn = authn::saga::Serialized::for_opctx(opctx); + let (.., authz_instance) = LookupPath::new(&opctx, &self.datastore) + .instance_id(instance.id()) + .lookup_for(authz::Action::Modify) + .await?; + let saga = SagaRequest::InstanceUpdate { + params: instance_update::Params { + serialized_authn, + authz_instance, + }, + }; self.saga_req .send(saga) .await diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 8a41e2d0622..361cae98733 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -6,6 +6,7 @@ use crate::app::background::Activator; use crate::app::background::BackgroundTask; +use crate::app::sagas; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::Instance; @@ -29,6 +30,7 @@ use std::future::Future; use std::num::NonZeroU32; use std::sync::Arc; use std::sync::Mutex; +use tokio::sync::mpsc::Sender; use uuid::Uuid; oximeter::use_timeseries!("vm-health-check.toml"); @@ -37,7 +39,6 @@ use virtual_machine::VirtualMachine; /// Background task that periodically checks instance states. pub(crate) struct InstanceWatcher { datastore: Arc, - resolver: internal_dns::resolver::Resolver, metrics: Arc>, id: WatcherIdentity, v2p_manager: Activator, @@ -51,7 +52,6 @@ const MAX_SLED_AGENTS: NonZeroU32 = unsafe { impl InstanceWatcher { pub(crate) fn new( datastore: Arc, - resolver: internal_dns::resolver::Resolver, producer_registry: &ProducerRegistry, id: WatcherIdentity, v2p_manager: Activator, @@ -70,7 +70,6 @@ impl InstanceWatcher { target: VirtualMachine, ) -> impl Future + Send + 'static { let datastore = self.datastore.clone(); - let resolver = self.resolver.clone(); let opctx = opctx.child( std::iter::once(( @@ -89,8 +88,12 @@ impl InstanceWatcher { target.instance_id, )) .await; - let mut check = - Check { target, outcome: Default::default(), result: Ok(()) }; + let mut check = Check { + target, + outcome: Default::default(), + result: Ok(()), + update_saga_queued: false, + }; let state = match rsp { Ok(rsp) => rsp.into_inner(), Err(ClientError::ErrorResponse(rsp)) => { @@ -181,22 +184,18 @@ impl InstanceWatcher { updated.ok_or_else(|| { slog::warn!( opctx.log, - "error updating instance: not found in database"; - "state" => ?new_runtime_state.vmm_state.state, + "error updating instance"; + "error" => ?e, ); - Incomplete::InstanceNotFound + Incomplete::UpdateFailed }) - }) - .map(|updated| { - slog::debug!( - opctx.log, - "update successful"; - "instance_updated" => updated.instance_updated, - "vmm_updated" => updated.vmm_updated, - "state" => ?new_runtime_state.vmm_state.state, - ); - }); - + .map(|updated| { + slog::debug!( + opctx.log, "update successful"; + "vmm_updated" => ?updated, + ); + check.update_saga_queued = updated; + }); check } } @@ -259,6 +258,8 @@ struct Check { /// Depending on when the error occurred, the `outcome` field may also /// be populated. result: Result<(), Incomplete>, + + update_saga_queued: bool, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)] @@ -418,6 +419,7 @@ impl BackgroundTask for InstanceWatcher { // Now, wait for the check results to come back. 
let mut total: usize = 0; + let mut update_sagas_queued: usize = 0; let mut instance_states: BTreeMap = BTreeMap::new(); let mut check_failures: BTreeMap = @@ -446,7 +448,11 @@ impl BackgroundTask for InstanceWatcher { if let Err(ref reason) = check.result { *check_errors.entry(reason.as_str().into_owned()).or_default() += 1; } + if check.update_saga_queued { + update_sagas_queued += 1; + } self.metrics.lock().unwrap().record_check(check); + } // All requests completed! Prune any old instance metrics for @@ -460,6 +466,7 @@ impl BackgroundTask for InstanceWatcher { "total_completed" => instance_states.len() + check_failures.len(), "total_failed" => check_failures.len(), "total_incomplete" => check_errors.len(), + "update_sagas_queued" => update_sagas_queued, "pruned_instances" => pruned, ); serde_json::json!({ @@ -467,6 +474,7 @@ impl BackgroundTask for InstanceWatcher { "instance_states": instance_states, "failed_checks": check_failures, "incomplete_checks": check_errors, + "update_sagas_queued": update_sagas_queued, "pruned_instances": pruned, }) } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 00ab1082428..bfb4d2ac733 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1517,23 +1517,40 @@ impl super::Nexus { /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. pub(crate) async fn notify_instance_updated( - &self, + self: &Arc, opctx: &OpContext, instance_id: &InstanceUuid, new_runtime_state: &nexus::SledInstanceState, ) -> Result<(), Error> { - notify_instance_updated( - &self.datastore(), - self.resolver(), - &self.opctx_alloc, - opctx, - &self.log, - instance_id, - new_runtime_state, - &self.background_tasks.task_v2p_manager, - ) - .await?; - self.vpc_needed_notify_sleds(); + let propolis_id = new_runtime_state.propolis_id; + info!(opctx.log, "received new VMM runtime state from sled agent"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "vmm_state" => ?new_runtime_state.vmm_state); + + let updated = self + .db_datastore + .vmm_update_runtime( + &propolis_id, + // TODO(eliza): probably should take this by value... + &new_runtime_state.vmm_state.clone().into(), + ) + .await?; + if updated { + let (.., authz_instance) = + LookupPath::new(&opctx, &self.db_datastore) + .instance_id(*instance_id) + .lookup_for(authz::Action::Modify) + .await?; + let saga_params = sagas::instance_update::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + authz_instance, + }; + self.execute_saga::( + saga_params, + ) + .await?; + } Ok(()) } @@ -1972,33 +1989,20 @@ impl super::Nexus { } } -/// Invoked by a sled agent to publish an updated runtime state for an -/// Instance. 
-#[allow(clippy::too_many_arguments)] // :( -pub(crate) async fn notify_instance_updated( +/// [`Nexus::notify_instance_updated`] (~~Taylor~~ background task's version) +pub(crate) async fn notify_instance_updated_background( datastore: &DataStore, - resolver: &internal_dns::resolver::Resolver, - opctx_alloc: &OpContext, opctx: &OpContext, - log: &slog::Logger, - instance_id: &InstanceUuid, - new_runtime_state: &nexus::SledInstanceState, - v2p_manager: &crate::app::background::Activator, -) -> Result, Error> { + saga_request: &tokio::sync::mpsc::Sender, + instance_id: InstanceUuid, + new_runtime_state: nexus::SledInstanceState, +) -> Result { let propolis_id = new_runtime_state.propolis_id; - - info!(log, "received new VMM runtime state from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?new_runtime_state.migration_state); - - // Grab the current state of the instance in the DB to reason about - // whether this update is stale or not. - let (.., authz_instance, db_instance) = LookupPath::new(&opctx, &datastore) - .instance_id(instance_id.into_untyped_uuid()) - .fetch() - .await?; + info!(opctx.log, "received new VMM runtime state from sled agent"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migration_state); let updated = datastore .vmm_update_runtime( @@ -2008,31 +2012,24 @@ pub(crate) async fn notify_instance_updated( ) .await?; - // // Update OPTE and Dendrite if the instance's active sled assignment - // // changed or a migration was retired. If these actions fail, sled agent - // // is expected to retry this update. - // // - // // This configuration must be updated before updating any state in CRDB - // // so that, if the instance was migrating or has shut down, it will not - // // appear to be able to migrate or start again until the appropriate - // // networking state has been written. Without this interlock, another - // // thread or another Nexus can race with this routine to write - // // conflicting configuration. - // // - // // In the future, this should be replaced by a call to trigger a - // // networking state update RPW. - // super::instance_network::ensure_updated_instance_network_config( - // datastore, - // log, - // resolver, - // opctx, - // opctx_alloc, - // &authz_instance, - // db_instance.runtime(), - // &new_runtime_state.instance_state, - // v2p_notification_tx.clone(), - // ) - // .await?; + if updated { + let (.., authz_instance) = LookupPath::new(&opctx, datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Modify) + .await?; + let params = sagas::instance_update::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + authz_instance, + }; + saga_request + .send(sagas::SagaRequest::InstanceUpdate { params }) + .await + .map_err(|_| { + Error::internal_error( + "background saga executor is gone! 
this is not supposed to happen" + ) + })?; + } // // If the supplied instance state indicates that the instance no longer // // has an active VMM, attempt to delete the virtual provisioning record, @@ -2181,7 +2178,7 @@ pub(crate) async fn notify_instance_updated( // } // } - Ok(Some(InstanceUpdated { vmm_updated: updated, instance_updated: false })) + Ok(updated) } /// Determines the disposition of a request to start an instance given its state diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index 9508d5e7e31..60ed611bd7e 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -23,7 +23,6 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; -use nexus_types::identity::Resource; use omicron_common::address::DENDRITE_PORT; use omicron_common::address::MGD_PORT; use omicron_common::address::MGS_PORT; diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 0ed87c74f12..70e82ee6b15 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -187,7 +187,7 @@ async fn siud_release_virtual_provisioning( async fn siud_unassign_oximeter_producer( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((_, vmm)) = get_destroyed_vmm(&sagactx)? else { + let Some((_, _)) = get_destroyed_vmm(&sagactx)? else { // if the update we are handling is not an active VMM destroyed update, // bail --- there's nothing to do here. return Ok(()); From 0bc3ae33c8bbb267d2863329402a45282bd04af5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 28 May 2024 11:24:17 -0700 Subject: [PATCH 026/234] start ripping out sled-agent instance state munging --- clients/nexus-client/src/lib.rs | 3 +- nexus/src/app/instance.rs | 7 +- .../app/sagas/instance_update/destroyed.rs | 11 +- nexus/src/app/sagas/instance_update/mod.rs | 6 +- sled-agent/src/common/instance.rs | 336 +----------------- sled-agent/src/instance.rs | 47 +-- sled-agent/src/sim/instance.rs | 29 +- 7 files changed, 50 insertions(+), 389 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index bb44faba8d1..ea6f53deaaf 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -139,7 +139,7 @@ impl From Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: s.migration_state.map(Into::into), + // migration_state: s.migration_state.map(Into::into), } } } @@ -183,7 +183,6 @@ impl From Input::Completed => Self::Completed, Input::Failed => Self::Failed, } - Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into() } } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index bfb4d2ac733..112d54f9c92 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1343,8 +1343,11 @@ impl super::Nexus { "propolis_id" => %state.propolis_id, "result" => ?update_result); - // TODO(eliza): probably just change the retval to `bool` later... - update_result.map(|vmm_updated| (false, vmm_updated)) + Ok(InstanceUpdateResult { + instance_updated: false, + vmm_updated: update_result?, + migration_updated: None, + }) } else { // There was no instance state to write back, so --- perhaps // obviously --- nothing happened. 
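Aside (not part of the patches above): the hunk in `nexus/src/app/instance.rs` only shows the sending half of the saga-request channel — `saga_request.send(SagaRequest::InstanceUpdate { params })`. A minimal sketch of what the receiving half presumably looks like is below. `run_saga_requests` is a hypothetical name; `SagaRequest::InstanceUpdate`, the `instance_update` saga, and `Nexus::execute_saga` are taken from elsewhere in this series, and the retry behavior described in the comment is an assumption.

    use std::sync::Arc;
    use tokio::sync::mpsc::Receiver;

    async fn run_saga_requests(
        nexus: Arc<Nexus>,
        mut rx: Receiver<SagaRequest>,
    ) {
        // Drain queued saga requests and execute each one. If an update saga
        // fails here, the `instance_updater` background task presumably
        // notices that the instance still needs an update and queues it again.
        while let Some(req) = rx.recv().await {
            if let SagaRequest::InstanceUpdate { params } = req {
                let _ = nexus
                    .execute_saga::<sagas::instance_update::SagaInstanceUpdate>(
                        params,
                    )
                    .await;
            }
        }
    }
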
diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 70e82ee6b15..b141a9d3ead 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -12,12 +12,13 @@ use crate::app::sagas::ActionError; use nexus_db_model::Generation; use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; +use nexus_db_model::InstanceState; use nexus_db_model::Vmm; -use nexus_db_queries::db::datastore::InstanceAndVmms; +use nexus_db_model::VmmState; +use nexus_db_queries::db::datastore::InstanceSnapshot; use nexus_db_queries::db::identity::Resource; use omicron_common::api::external; use omicron_common::api::external::Error; -use omicron_common::api::external::InstanceState; use slog::info; // instance update VMM destroyed subsaga: actions @@ -92,9 +93,9 @@ impl NexusSaga for SagaVmmDestroyed { fn get_destroyed_vmm( sagactx: &NexusActionContext, ) -> Result, ActionError> { - let state = sagactx.lookup::(STATE)?; + let state = sagactx.lookup::(STATE)?; match state.active_vmm { - Some(vmm) if vmm.runtime.state.state() == &InstanceState::Destroyed => { + Some(vmm) if vmm.runtime.state.state() == &VmmState::Destroyed => { Ok(Some((state.instance, vmm))) } _ => Ok(None), @@ -273,7 +274,7 @@ async fn siud_update_instance( let osagactx = sagactx.user_data(); let new_runtime = InstanceRuntimeState { propolis_id: None, - nexus_state: external::InstanceState::Stopped.into(), + nexus_state: InstanceState::NoVmm.into(), gen: Generation(instance.runtime_state.gen.0.next()), ..instance.runtime_state }; diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 5a2940f9d98..a186bf3c755 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -179,7 +179,7 @@ async fn siu_lock_instance( "updater_gen" => ?gen, ); let lock = datastore - .instance_updater_try_lock(&opctx, &authz_instance, gen, &lock_id) + .instance_updater_lock(&opctx, &authz_instance, gen, &lock_id) .await .map_err(ActionError::action_failed)?; match lock { @@ -208,7 +208,7 @@ async fn siu_lock_instance( async fn siu_fetch_state( sagactx: NexusActionContext, -) -> Result { +) -> Result { let osagactx = sagactx.user_data(); let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; @@ -217,7 +217,7 @@ async fn siu_fetch_state( osagactx .datastore() - .instance_fetch_with_vmms(&opctx, authz_instance) + .instance_fetch_all(&opctx, authz_instance) .await .map_err(ActionError::action_failed) } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 95eda52cdbe..b0f642188be 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -8,8 +8,8 @@ use crate::params::InstanceMigrationSourceParams; use chrono::{DateTime, Utc}; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, MigrationRole, MigrationRuntimeState, MigrationState, - SledInstanceState, VmmRuntimeState, VmmState, + MigrationRole, MigrationRuntimeState, MigrationState, SledInstanceState, + VmmRuntimeState, VmmState, }; use omicron_uuid_kinds::PropolisUuid; use propolis_client::types::{ @@ -20,7 +20,6 @@ use propolis_client::types::{ /// The instance and VMM state that sled agent maintains on a per-VMM basis. 
#[derive(Clone, Debug)] pub struct InstanceStates { - instance: InstanceRuntimeState, vmm: VmmRuntimeState, propolis_id: PropolisUuid, migration: Option, @@ -111,10 +110,10 @@ pub(crate) struct ObservedPropolisState { impl ObservedPropolisState { /// Constructs a Propolis state observation from an instance's current - /// runtime state and an instance state monitor response received from + /// state and an instance state monitor response received from /// Propolis. pub fn new( - instance_runtime: &InstanceRuntimeState, + state: &InstanceStates, propolis_state: &InstanceStateMonitorResponse, ) -> Self { // If there's no migration currently registered with this sled, report @@ -213,31 +212,8 @@ pub enum Action { } impl InstanceStates { - pub fn new( - instance: InstanceRuntimeState, - vmm: VmmRuntimeState, - propolis_id: PropolisUuid, - ) -> Self { - let migration = instance.migration_id.map(|migration_id| { - let dst_propolis_id = instance.dst_propolis_id.expect("if an instance has a migration ID, it should also have a target VMM ID"); - let role = if dst_propolis_id == propolis_id { - MigrationRole::Target - } else { - MigrationRole::Source - }; - MigrationRuntimeState { - migration_id, - state: MigrationState::InProgress, - role, - gen: Generation::new(), - time_updated: Utc::now(), - } - }); - InstanceStates { instance, vmm, propolis_id, migration } - } - - pub fn instance(&self) -> &InstanceRuntimeState { - &self.instance + pub fn new(vmm: VmmRuntimeState, propolis_id: Uuid) -> Self { + InstanceStates { vmm, propolis_id, migration: None } } pub fn vmm(&self) -> &VmmRuntimeState { @@ -308,63 +284,12 @@ impl InstanceStates { MigrationState::Completed, observed.time, ); - match self.propolis_role() { - // This is a successful migration out. Point the instance to the - // target VMM, but don't clear migration IDs; let the target do - // that so that the instance will continue to appear to be - // migrating until it is safe to migrate again. - PropolisRole::Active => { - self.switch_propolis_id_to_target(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Retired); - } - - // This is a successful migration in. Point the instance to the - // target VMM and clear migration IDs so that another migration - // in can begin. Propolis will continue reporting that this - // migration was successful, but because its ID has been - // discarded the observed migration status will change from - // Succeeded to NoMigration. - // - // Note that these calls increment the instance's generation - // number twice. This is by design and allows the target's - // migration-ID-clearing update to overtake the source's update. - PropolisRole::MigrationTarget => { - self.switch_propolis_id_to_target(observed.time); - self.clear_migration_ids(observed.time); - - assert_eq!(self.propolis_role(), PropolisRole::Active); - } - - // This is a migration source that previously reported success - // and removed itself from the active Propolis position. Don't - // touch the instance. - PropolisRole::Retired => {} - } } ObservedMigrationStatus::Failed => { self.transition_migration( MigrationState::Failed, observed.time, ); - - match self.propolis_role() { - // This is a failed migration out. CLear migration IDs so that - // Nexus can try again. - PropolisRole::Active => { - self.clear_migration_ids(observed.time); - } - - // This is a failed migration in. Leave the migration IDs alone - // so that the migration won't appear to have concluded until - // the source is ready to start a new one. 
- PropolisRole::MigrationTarget => {} - - // This VMM was part of a failed migration and was subsequently - // removed from the instance record entirely. There's nothing to - // update. - PropolisRole::Retired => {} - } } ObservedMigrationStatus::InProgress => { self.transition_migration( @@ -388,10 +313,6 @@ impl InstanceStates { // been transferred to the target, and what was once an active VMM // is now retired.) if vmm_gone { - if self.propolis_role() == PropolisRole::Active { - self.clear_migration_ids(observed.time); - self.retire_active_propolis(observed.time); - } // If there's an active migration and the VMM is suddenly gone, // that should constitute a migration failure! if let Some(MigrationState::Pending | MigrationState::InProgress) = @@ -408,54 +329,6 @@ impl InstanceStates { } } - /// Yields the role that this structure's VMM has given the structure's - /// current instance state. - fn propolis_role(&self) -> PropolisRole { - if let Some(active_id) = self.instance.propolis_id { - if active_id == self.propolis_id { - return PropolisRole::Active; - } - } - - if let Some(dst_id) = self.instance.dst_propolis_id { - if dst_id == self.propolis_id { - return PropolisRole::MigrationTarget; - } - } - - PropolisRole::Retired - } - - /// Sets the no-VMM fallback state of the current instance to reflect the - /// state of its terminated VMM and clears the instance's current Propolis - /// ID. Note that this routine does not touch any migration IDs. - /// - /// This should only be called by the state block for an active VMM and only - /// when that VMM is in a terminal state (Destroyed or Failed). - fn retire_active_propolis(&mut self, now: DateTime) { - assert!(self.propolis_role() == PropolisRole::Active); - - self.instance.propolis_id = None; - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - - /// Moves the instance's destination Propolis ID into the current active - /// position and updates the generation number, but does not clear the - /// destination ID or the active migration ID. This promotes a migration - /// target VMM into the active position without actually allowing a new - /// migration to begin. - /// - /// This routine should only be called when - /// `instance.dst_propolis_id.is_some()`. - fn switch_propolis_id_to_target(&mut self, now: DateTime) { - assert!(self.instance.dst_propolis_id.is_some()); - - self.instance.propolis_id = self.instance.dst_propolis_id; - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - /// Forcibly transitions this instance's VMM into the specified `next` /// state and updates its generation number. 
pub(crate) fn transition_vmm( @@ -494,7 +367,7 @@ impl InstanceStates { let fake_observed = ObservedPropolisState { vmm_state, - migration_status: if self.instance.migration_id.is_some() { + migration_status: if self.migration.is_some() { ObservedMigrationStatus::Failed } else { ObservedMigrationStatus::NoMigration @@ -517,8 +390,6 @@ impl InstanceStates { dst_propolis_id, }) = *ids { - self.instance.migration_id = Some(migration_id); - self.instance.dst_propolis_id = Some(dst_propolis_id); let role = if dst_propolis_id == self.propolis_id { MigrationRole::Target } else { @@ -532,22 +403,8 @@ impl InstanceStates { time_updated: now, }) } else { - self.instance.migration_id = None; - self.instance.dst_propolis_id = None; self.migration = None; } - - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; - } - - /// Unconditionally clears the instance's migration IDs and advances its - /// Propolis generation. Not public; used internally to conclude migrations. - fn clear_migration_ids(&mut self, now: DateTime) { - self.instance.migration_id = None; - self.instance.dst_propolis_id = None; - self.instance.gen = self.instance.gen.next(); - self.instance.time_updated = now; } /// Returns true if the migration IDs in this instance are already set as they @@ -555,47 +412,17 @@ impl InstanceStates { /// `old_runtime` to the ones in `migration_ids`. pub(crate) fn migration_ids_already_set( &self, - old_runtime: &InstanceRuntimeState, migration_ids: &Option, ) -> bool { - // For the old and new records to match, the new record's Propolis - // generation must immediately succeed the old record's. - // - // This is an equality check to try to avoid the following A-B-A - // problem: - // - // 1. Instance starts on sled 1. - // 2. Parallel sagas start, one to migrate the instance to sled 2 - // and one to migrate the instance to sled 3. - // 3. The "migrate to sled 2" saga completes. - // 4. A new migration starts that migrates the instance back to sled 1. - // 5. The "migrate to sled 3" saga attempts to set its migration - // ID. - // - // A simple less-than check allows the migration to sled 3 to proceed - // even though the most-recently-expressed intent to migrate put the - // instance on sled 1. - if old_runtime.gen.next() != self.instance.gen { - return false; - } - - match (self.instance.migration_id, migration_ids) { + match (self.migration, migration_ids) { // If the migration ID is already set, and this is a request to set // IDs, the records match if the relevant IDs match. - (Some(current_migration_id), Some(ids)) => { - let current_dst_id = self.instance.dst_propolis_id.expect( - "migration ID and destination ID must be set together", - ); - - current_migration_id == ids.migration_id - && current_dst_id == ids.dst_propolis_id + (Some(migration), Some(ids)) => { + migration.migration_id == ids.migration_id } // If the migration ID is already cleared, and this is a request to // clear IDs, the records match. 
- (None, None) => { - assert!(self.instance.dst_propolis_id.is_none()); - true - } + (None, None) => true, _ => false, } } @@ -605,8 +432,6 @@ impl InstanceStates { mod test { use super::*; - use crate::params::InstanceMigrationSourceParams; - use chrono::Utc; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::InstanceRuntimeState; @@ -630,15 +455,13 @@ mod test { time_updated: now, }; - InstanceStates::new(instance, vmm, propolis_id) + InstanceStates::new(vmm, propolis_id) } fn make_migration_source_instance() -> InstanceStates { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; let migration_id = Uuid::new_v4(); - state.instance.migration_id = Some(migration_id); - state.instance.dst_propolis_id = Some(PropolisUuid::new_v4()); state.migration = Some(MigrationRuntimeState { migration_id, state: MigrationState::InProgress, @@ -656,9 +479,7 @@ mod test { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; let migration_id = Uuid::new_v4(); - state.instance.migration_id = Some(migration_id); - state.propolis_id = PropolisUuid::new_v4(); - state.instance.dst_propolis_id = Some(state.propolis_id); + state.propolis_id = Uuid::new_v4(); state.migration = Some(MigrationRuntimeState { migration_id, state: MigrationState::InProgress, @@ -688,36 +509,6 @@ mod test { prev: &InstanceStates, next: &InstanceStates, ) { - // The predicate under test below is "if an interesting field changed, - // then the generation number changed." Testing the contrapositive is a - // little nicer because the assertion that trips identifies exactly - // which field changed without updating the generation number. - // - // The else branch tests the converse to make sure the generation number - // does not update unexpectedly. While this won't cause an important - // state update to be dropped, it can interfere with updates from other - // sleds that expect their own attempts to advance the generation number - // to cause new state to be recorded. - if prev.instance.gen == next.instance.gen { - assert_eq!(prev.instance.propolis_id, next.instance.propolis_id); - assert_eq!( - prev.instance.dst_propolis_id, - next.instance.dst_propolis_id - ); - assert_eq!(prev.instance.migration_id, next.instance.migration_id); - } else { - assert!( - (prev.instance.propolis_id != next.instance.propolis_id) - || (prev.instance.dst_propolis_id - != next.instance.dst_propolis_id) - || (prev.instance.migration_id - != next.instance.migration_id), - "prev: {:?}, next: {:?}", - prev, - next - ); - } - // Propolis is free to publish no-op VMM state updates (e.g. when an // in-progress migration's state changes but the migration is not yet // complete), so don't test the converse here. @@ -735,10 +526,6 @@ mod test { .apply_propolis_observation(&make_observed_state(state.into())); assert!(matches!(requested_action, Some(Action::Destroy))); - assert!( - instance_state.instance.gen - > original_instance_state.instance.gen - ); } } @@ -776,8 +563,6 @@ mod test { #[test] fn destruction_after_migration_out_does_not_transition() { let mut state = make_migration_source_instance(); - assert!(state.instance.dst_propolis_id.is_some()); - assert_ne!(state.instance.propolis_id, state.instance.dst_propolis_id); // After a migration succeeds, the source VM appears to stop but reports // that the migration has succeeded. 
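Aside (not part of the patch above): the tests in this hunk exercise `apply_propolis_observation`, which in the earlier hunks of this patch calls `transition_migration` to record migration progress, but that helper's body does not appear in this series. A minimal sketch of what it presumably does — advance only the `MigrationRuntimeState`, leaving the VMM record untouched — is shown below; treat the exact field updates as an assumption.

    fn transition_migration(
        &mut self,
        state: MigrationState,
        now: DateTime<Utc>,
    ) {
        // Only the migration record changes here; the VMM runtime state is
        // advanced separately by `transition_vmm`.
        if let Some(ref mut migration) = self.migration {
            migration.state = state;
            migration.gen = migration.gen.next();
            migration.time_updated = now;
        }
    }
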
@@ -793,13 +578,6 @@ mod test { let prev = state.clone(); assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); - assert!(state.instance.gen > prev.instance.gen); - assert_eq!( - state.instance.dst_propolis_id, - prev.instance.dst_propolis_id - ); - assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); - assert!(state.instance.migration_id.is_some()); // The migration state should transition to "completed" let migration = state @@ -819,7 +597,6 @@ mod test { observed.vmm_state = PropolisInstanceState(Observed::Stopped); assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); // The Stopped state is translated internally to Stopping to prevent // external viewers from perceiving that the instance is stopped before @@ -843,7 +620,6 @@ mod test { Some(Action::Destroy) )); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Destroyed); assert!(state.vmm.gen > prev.vmm.gen); @@ -863,7 +639,6 @@ mod test { // but should not change the instance's migration IDs. let observed = ObservedPropolisState { vmm_state: PropolisInstanceState(Observed::Failed), - migration_status: ObservedMigrationStatus::Failed, time: Utc::now(), }; @@ -873,7 +648,6 @@ mod test { Some(Action::Destroy) )); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Failed); assert!(state.vmm.gen > prev.vmm.gen); @@ -895,14 +669,12 @@ mod test { #[test] fn rude_terminate_of_migration_target_does_not_transition_instance() { let mut state = make_migration_target_instance(); - assert_eq!(state.propolis_role(), PropolisRole::MigrationTarget); let prev = state.clone(); let mark_failed = false; state.terminate_rudely(mark_failed); assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.instance.gen, prev.instance.gen); // The migration state should transition. let migration = @@ -927,9 +699,6 @@ mod test { let prev = state.clone(); assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); - assert!(state.instance.migration_id.is_none()); - assert!(state.instance.dst_propolis_id.is_none()); - assert!(state.instance.gen > prev.instance.gen); assert_eq!(state.vmm.state, VmmState::Running); assert!(state.vmm.gen > prev.vmm.gen); @@ -954,7 +723,6 @@ mod test { Utc::now(), ); assert_state_change_has_gen_change(&prev, &state); - assert!(state.instance.gen > prev.instance.gen); assert_eq!(state.vmm.gen, prev.vmm.gen); // There should be a new, pending migration state. @@ -971,20 +739,10 @@ mod test { // generation. let prev = state.clone(); observed.vmm_state = PropolisInstanceState(Observed::Migrating); - observed.migration_status = ObservedMigrationStatus::InProgress; assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); - assert_eq!( - state.instance.migration_id.unwrap(), - prev.instance.migration_id.unwrap() - ); - assert_eq!( - state.instance.dst_propolis_id.unwrap(), - prev.instance.dst_propolis_id.unwrap() - ); assert_eq!(state.vmm.state, VmmState::Migrating); assert!(state.vmm.gen > prev.vmm.gen); - assert_eq!(state.instance.gen, prev.instance.gen); // The migration state should transition to in progress. 
let migration = state @@ -1000,18 +758,10 @@ mod test { // touch the migration ID (that is the new target's job). let prev = state.clone(); observed.vmm_state = PropolisInstanceState(Observed::Migrating); - observed.migration_status = ObservedMigrationStatus::Succeeded; assert!(state.apply_propolis_observation(&observed).is_none()); assert_state_change_has_gen_change(&prev, &state); assert_eq!(state.vmm.state, VmmState::Migrating); assert!(state.vmm.gen > prev.vmm.gen); - assert_eq!(state.instance.migration_id, prev.instance.migration_id); - assert_eq!( - state.instance.dst_propolis_id, - prev.instance.dst_propolis_id, - ); - assert_eq!(state.instance.propolis_id, state.instance.dst_propolis_id); - assert!(state.instance.gen > prev.instance.gen); // The migration state should transition to completed. let migration = state @@ -1023,64 +773,4 @@ mod test { // The rest of the destruction sequence is covered by other tests. } - - #[test] - fn test_migration_ids_already_set() { - let orig_instance = make_instance(); - let mut old_instance = orig_instance.clone(); - let mut new_instance = old_instance.clone(); - - // Advancing the old instance's migration IDs and then asking if the - // new IDs are present should indicate that they are indeed present. - let migration_ids = InstanceMigrationSourceParams { - migration_id: Uuid::new_v4(), - dst_propolis_id: PropolisUuid::new_v4(), - }; - - new_instance.set_migration_ids(&Some(migration_ids), Utc::now()); - assert!(new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - // The IDs aren't already set if the new record has an ID that's - // advanced from the old record by more than one generation. - let mut newer_instance = new_instance.clone(); - newer_instance.instance.gen = newer_instance.instance.gen.next(); - assert!(!newer_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - // They also aren't set if the old generation has somehow equaled or - // surpassed the current generation. - old_instance.instance.gen = old_instance.instance.gen.next(); - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - // If the generation numbers are right, but either requested ID is not - // present in the current instance, the requested IDs aren't set. 
- old_instance = orig_instance; - new_instance.instance.migration_id = Some(Uuid::new_v4()); - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - new_instance.instance.migration_id = Some(migration_ids.migration_id); - new_instance.instance.dst_propolis_id = Some(PropolisUuid::new_v4()); - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - - new_instance.instance.migration_id = None; - new_instance.instance.dst_propolis_id = None; - assert!(!new_instance.migration_ids_already_set( - old_instance.instance(), - &Some(migration_ids) - )); - } } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 7bfe308f94e..ea635c42dc8 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -385,7 +385,7 @@ impl InstanceRunner { match request { Some(Update { state, tx }) => { let observed = ObservedPropolisState::new( - self.state.instance(), + &self.state, &state, ); let reaction = self.observe_state(&observed).await; @@ -433,10 +433,7 @@ impl InstanceRunner { }, Some(PutMigrationIds{ old_runtime, migration_ids, tx }) => { tx.send( - self.put_migration_ids( - &old_runtime, - &migration_ids - ).await.map_err(|e| e.into()) + self.put_migration_ids(&migration_ids).await.map_err(|e| e.into()) ) .map_err(|_| Error::FailedSendClientClosed) }, @@ -649,7 +646,6 @@ impl InstanceRunner { self.log, "updated state after observing Propolis state change"; "propolis_id" => %self.state.propolis_id(), - "new_instance_state" => ?self.state.instance(), "new_vmm_state" => ?self.state.vmm() ); @@ -711,10 +707,13 @@ impl InstanceRunner { let migrate = match migrate { Some(params) => { - let migration_id = - self.state.instance().migration_id.ok_or_else(|| { + let migration_id = self + .state + .migration() + .ok_or_else(|| { Error::Migration(anyhow!("Missing Migration UUID")) - })?; + })? + .migration_id; Some(propolis_client::types::InstanceMigrateInitiateRequest { src_addr: params.src_propolis_addr.to_string(), src_uuid: params.src_propolis_id, @@ -1098,11 +1097,7 @@ impl Instance { dhcp_config, requested_disks: hardware.disks, cloud_init_bytes: hardware.cloud_init_bytes, - state: InstanceStates::new( - instance_runtime, - vmm_runtime, - propolis_id, - ), + state: InstanceStates::new(vmm_runtime, propolis_id), running_state: None, nexus_client, storage, @@ -1378,28 +1373,12 @@ impl InstanceRunner { async fn put_migration_ids( &mut self, - old_runtime: &InstanceRuntimeState, migration_ids: &Option, ) -> Result { - // Check that the instance's current generation matches the one the - // caller expects to transition from. This helps Nexus ensure that if - // multiple migration sagas launch at Propolis generation N, then only - // one of them will successfully set the instance's migration IDs. - if self.state.instance().gen != old_runtime.gen { - // Allow this transition for idempotency if the instance is - // already in the requested goal state. - if self.state.migration_ids_already_set(old_runtime, migration_ids) - { - return Ok(self.state.sled_instance_state()); - } - - return Err(Error::Transition( - omicron_common::api::external::Error::conflict(format!( - "wrong instance state generation: expected {}, got {}", - self.state.instance().gen, - old_runtime.gen - )), - )); + // Allow this transition for idempotency if the instance is + // already in the requested goal state. 
+ if self.state.migration_ids_already_set(migration_ids) { + return Ok(self.state.sled_instance_state()); } self.state.set_migration_ids(migration_ids, Utc::now()); diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 05315f3f333..33b9cc10a1e 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -85,14 +85,13 @@ impl SimInstanceInner { // Propolis transitions to the Migrating state once before // actually starting migration. self.queue_propolis_state(PropolisInstanceState::Migrating); - let migration_id = - self.state.instance().migration_id.unwrap_or_else(|| { - panic!( - "should have migration ID set before getting request to + let migration_id = self.state.migration().unwrap_or_else(|| { + panic!( + "should have migration ID set before getting request to migrate in (current state: {:?})", - self - ) - }); + self + ) + }); match role { MigrationRole::Source => { @@ -279,7 +278,7 @@ impl SimInstanceInner { } self.state.apply_propolis_observation(&ObservedPropolisState::new( - self.state.instance(), + &self.state, &self.last_response, )) } else { @@ -374,21 +373,12 @@ impl SimInstanceInner { /// Stores a set of migration IDs in the instance's runtime state. fn put_migration_ids( &mut self, - old_runtime: &InstanceRuntimeState, ids: &Option, ) -> Result { - if self.state.migration_ids_already_set(old_runtime, ids) { + if self.state.migration_ids_already_set(ids) { return Ok(self.state.sled_instance_state()); } - if self.state.instance().gen != old_runtime.gen { - return Err(Error::invalid_request(format!( - "wrong Propolis ID generation: expected {}, got {}", - self.state.instance().gen, - old_runtime.gen - ))); - } - self.state.set_migration_ids(ids, Utc::now()); // If we set migration IDs and are the migration source, ensure that we @@ -443,7 +433,7 @@ impl SimInstance { ids: &Option, ) -> Result { let mut inner = self.inner.lock().unwrap(); - inner.put_migration_ids(old_runtime, ids) + inner.put_migration_ids(ids) } } @@ -466,7 +456,6 @@ impl Simulatable for SimInstance { SimInstance { inner: Arc::new(Mutex::new(SimInstanceInner { state: InstanceStates::new( - todo!(), current.vmm_state, current.propolis_id, ), From b2cf79cafe88925b4a251b6a2585dd91892377f5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 28 May 2024 12:04:36 -0700 Subject: [PATCH 027/234] whoops forgot this one --- nexus/tests/config.test.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index e231f665fa9..23d3d2f5bc7 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -124,6 +124,7 @@ v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 +instance_updater.period_secs = 30 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the From a69caf515a17d1249c50813350829ca6009521d8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 28 May 2024 12:43:17 -0700 Subject: [PATCH 028/234] wip dead code cleanup --- .../app/background/tasks/instance_watcher.rs | 8 +++- nexus/src/app/instance_network.rs | 38 +------------------ sled-agent/src/common/instance.rs | 14 ------- 3 files changed, 8 insertions(+), 52 deletions(-) diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 361cae98733..511451d7b48 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ 
b/nexus/src/app/background/tasks/instance_watcher.rs @@ -18,6 +18,7 @@ use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; use nexus_types::identity::Asset; use nexus_types::identity::Resource; +use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_uuid_kinds::GenericUuid; @@ -187,7 +188,12 @@ impl InstanceWatcher { "error updating instance"; "error" => ?e, ); - Incomplete::UpdateFailed + match e { + Error::ObjectNotFound { .. } => { + Incomplete::InstanceNotFound + } + _ => Incomplete::UpdateFailed, + } }) .map(|updated| { slog::debug!( diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 5f5274dea23..946a215c7c7 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -258,6 +258,7 @@ pub(crate) async fn boundary_switches( /// `Ok(())` if this routine completed all the operations it wanted to /// complete, or an appropriate `Err` otherwise. #[allow(clippy::too_many_arguments)] // Yeah, I know, I know, Clippy... +#[allow(dead_code)] // TODO(eliza): this probably needs to be deleted eventually pub(crate) async fn ensure_updated_instance_network_config( datastore: &DataStore, log: &slog::Logger, @@ -685,43 +686,6 @@ pub(crate) async fn probe_ensure_dpd_config( Ok(()) } -/// Deletes an instance's OPTE V2P mappings and the boundary switch NAT -/// entries for its external IPs. -/// -/// This routine returns immediately upon encountering any errors (and will -/// not try to destroy any more objects after the point of failure). -async fn clear_instance_networking_state( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - authz_instance: &authz::Instance, - v2p_manager: &background::Activator, -) -> Result<(), Error> { - v2p_manager.activate(); - - instance_delete_dpd_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - authz_instance, - ) - .await?; - - notify_dendrite_nat_state( - datastore, - log, - resolver, - opctx_alloc, - Some(InstanceUuid::from_untyped_uuid(authz_instance.id())), - true, - ) - .await -} - /// Attempts to delete all of the Dendrite NAT configuration for the /// instance identified by `authz_instance`. /// diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index b0f642188be..97abed9c57c 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -190,20 +190,6 @@ impl From for VmmState { } } -/// The possible roles a VMM can have vis-a-vis an instance. -#[derive(Clone, Copy, Debug, PartialEq)] -enum PropolisRole { - /// The VMM is its instance's current active VMM. - Active, - - /// The VMM is its instance's migration target VMM. - MigrationTarget, - - /// The instance does not refer to this VMM (but it may have done so in the - /// past). - Retired, -} - /// Action to be taken on behalf of state transition. 
#[derive(Clone, Copy, Debug, PartialEq)] pub enum Action { From d1709a56732f649fad592017287b094cad4e0ca3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 May 2024 12:59:40 -0700 Subject: [PATCH 029/234] post merge fixy-wixy --- nexus/src/app/sagas/instance_update/mod.rs | 79 +++------------------- 1 file changed, 8 insertions(+), 71 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index a186bf3c755..fef9a08c7d2 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -129,81 +129,18 @@ async fn siu_lock_instance( crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let datastore = osagactx.datastore(); let log = osagactx.log(); - let instance_id = authz_instance.id(); slog::info!( - log, + osagactx.log(), "instance update: attempting to lock instance"; - "instance_id" => %instance_id, + "instance_id" => %instance.id(), "saga_id" => %lock_id, ); - - loop { - let instance = datastore - .instance_refetch(&opctx, &authz_instance) - .await - .map_err(ActionError::action_failed)?; - // Look at the current lock state of the instance and determine whether - // we can lock it. - match instance.runtime_state.updater_id { - Some(ref id) if id == &lock_id => { - slog::info!( - log, - "instance update: instance already locked by this saga"; - "instance_id" => %instance_id, - "saga_id" => %lock_id, - ); - return Ok(instance.runtime_state.updater_gen); - } - Some(ref id) => { - slog::info!( - log, - "instance update: instance locked by another saga"; - "instance_id" => %instance_id, - "saga_id" => %lock_id, - "locked_by" => %lock_id, - ); - return Err(ActionError::action_failed(serde_json::json!({ - "error": "instance locked by another saga", - "saga_id": lock_id, - "locked_by": id, - }))); - } - None => {} - }; - let gen = instance.runtime_state.updater_gen; - slog::debug!( - log, - "instance update: trying to acquire updater lock..."; - "instance_id" => %instance_id, - "saga_id" => %lock_id, - "updater_gen" => ?gen, - ); - let lock = datastore - .instance_updater_lock(&opctx, &authz_instance, gen, &lock_id) - .await - .map_err(ActionError::action_failed)?; - match lock { - Some(lock_gen) => { - slog::info!( - log, - "instance update: acquired updater lock"; - "instance_id" => %instance_id, - "saga_id" => %lock_id, - "updater_gen" => ?gen, - ); - return Ok(lock_gen); - } - None => { - slog::debug!( - log, - "instance update: generation has advanced, retrying..."; - "instance_id" => %instance_id, - "saga_id" => %lock_id, - "updater_gen" => ?gen, - ); - } - } - } + osagactx + .datastore() + .instance_updater_lock(&opctx, authz_instance, &lock_id) + .await + .map_err(ActionError::action_failed) + .map(|_| ()) } async fn siu_fetch_state( From 6efd3742a4d4c27e7c5c7c538c2edafa7b735536 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 May 2024 14:13:17 -0700 Subject: [PATCH 030/234] it's sagas all the way down sickos dot png --- .../app/sagas/instance_update/destroyed.rs | 91 ++++--- nexus/src/app/sagas/instance_update/mod.rs | 229 +++++++++++++----- nexus/src/app/sagas/instance_update/start.rs | 0 3 files changed, 211 insertions(+), 109 deletions(-) create mode 100644 nexus/src/app/sagas/instance_update/start.rs diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index b141a9d3ead..d8e55eb4b17 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ 
b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -5,7 +5,6 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; -use super::Params; use super::STATE; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::ActionError; @@ -15,13 +14,17 @@ use nexus_db_model::InstanceRuntimeState; use nexus_db_model::InstanceState; use nexus_db_model::Vmm; use nexus_db_model::VmmState; +use nexus_db_queries::authn; +use nexus_db_queries::authz; use nexus_db_queries::db::datastore::InstanceSnapshot; use nexus_db_queries::db::identity::Resource; use omicron_common::api::external; use omicron_common::api::external::Error; +use serde::{Deserialize, Serialize}; use slog::info; +use uuid::Uuid; -// instance update VMM destroyed subsaga: actions +// instance update (active VMM destroyed) subsaga: actions // This subsaga is responsible for handling an instance update where the // instance's active VMM has entered the `Destroyed` state. This requires @@ -64,6 +67,21 @@ declare_saga_actions! { } } +/// Parameters to the instance update (active VMM destroyed) sub-saga. +#[derive(Debug, Deserialize, Serialize)] +pub(super) struct Params { + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub(super) serialized_authn: authn::saga::Serialized, + + pub(super) authz_instance: authz::Instance, + + /// The UUID of the VMM that was destroyed. + pub(super) vmm_id: Uuid, + + pub(super) instance: Instance, +} + #[derive(Debug)] pub(crate) struct SagaVmmDestroyed; impl NexusSaga for SagaVmmDestroyed { @@ -105,14 +123,8 @@ fn get_destroyed_vmm( async fn siud_release_sled_resources( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((_, vmm)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. - return Ok(()); - }; - let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance } = + let Params { ref serialized_authn, ref authz_instance, vmm_id, .. } = sagactx.saga_params::()?; let opctx = @@ -122,13 +134,13 @@ async fn siud_release_sled_resources( osagactx.log(), "instance update (active VMM destroyed): deallocating sled resource reservation"; "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm.id, + "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); osagactx .datastore() - .sled_reservation_delete(&opctx, vmm.id) + .sled_reservation_delete(&opctx, vmm_id) .await .or_else(|err| { // Necessary for idempotency @@ -150,7 +162,7 @@ async fn siud_release_virtual_provisioning( }; let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance } = + let Params { ref serialized_authn, ref authz_instance, vmm_id, .. } = sagactx.saga_params::()?; let opctx = @@ -160,7 +172,7 @@ async fn siud_release_virtual_provisioning( osagactx.log(), "instance update (VMM destroyed): deallocating virtual provisioning resources"; "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm.id, + "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); @@ -188,11 +200,6 @@ async fn siud_release_virtual_provisioning( async fn siud_unassign_oximeter_producer( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((_, _)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. 
- return Ok(()); - }; let osagactx = sagactx.user_data(); let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; @@ -213,17 +220,15 @@ async fn siud_unassign_oximeter_producer( async fn siud_delete_v2p_mappings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. - return Ok(()); - }; + let Params { ref authz_instance, vmm_id, .. } = + sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); info!( osagactx.log(), "instance update (VMM destroyed): deleting V2P mappings"; - "instance_id" => %instance.id(), - "propolis_id" => %vmm.id, + "instance_id" => %authz_instance.id(), + "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); @@ -235,13 +240,8 @@ async fn siud_delete_v2p_mappings( async fn siud_delete_nat_entries( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((_, vmm)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. - return Ok(()); - }; let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, .. } = + let Params { ref serialized_authn, ref authz_instance, vmm_id, .. } = sagactx.saga_params::()?; let opctx = @@ -251,7 +251,7 @@ async fn siud_delete_nat_entries( osagactx.log(), "instance update (VMM destroyed): deleting NAT entries"; "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm.id, + "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); @@ -266,11 +266,9 @@ async fn siud_delete_nat_entries( async fn siud_update_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. - return Ok(()); - }; + let Params { ref authz_instance, ref vmm_id, instance, .. } = + sagactx.saga_params::()?; + let osagactx = sagactx.user_data(); let new_runtime = InstanceRuntimeState { propolis_id: None, @@ -282,8 +280,8 @@ async fn siud_update_instance( info!( osagactx.log(), "instance update (VMM destroyed): updating runtime state"; - "instance_id" => %instance.id(), - "propolis_id" => %vmm.id, + "instance_id" => %authz_instance.id(), + "propolis_id" => %vmm_id, "new_runtime_state" => ?new_runtime, "instance_update" => %"VMM destroyed", ); @@ -291,7 +289,7 @@ async fn siud_update_instance( // It's okay for this to fail, it just means that the active VMM ID has changed. let _ = osagactx .datastore() - .instance_update_runtime(&instance.id(), &new_runtime) + .instance_update_runtime(&authz_instance.id(), &new_runtime) .await; Ok(()) } @@ -299,13 +297,8 @@ async fn siud_update_instance( async fn siud_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. - return Ok(()); - }; let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, .. } = + let Params { ref authz_instance, ref vmm_id, ref serialized_authn, .. 
} = sagactx.saga_params::()?; let opctx = @@ -314,14 +307,14 @@ async fn siud_mark_vmm_deleted( info!( osagactx.log(), "instance update (VMM destroyed): marking VMM record deleted"; - "instance_id" => %instance.id(), - "propolis_id" => %vmm.id, + "instance_id" => %authz_instance.id(), + "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); osagactx .datastore() - .vmm_mark_deleted(&opctx, &vmm.id) + .vmm_mark_deleted(&opctx, vmm_id) .await .map(|_| ()) .map_err(ActionError::action_failed) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index fef9a08c7d2..a6daefd87ce 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -7,9 +7,7 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::db::datastore::InstanceSnapshot; -use crate::app::db::lookup::LookupPath; use crate::app::sagas::declare_saga_actions; -use nexus_db_model::Generation; use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; use omicron_common::api::external::InstanceState; @@ -18,7 +16,8 @@ use steno::{ActionError, DagBuilder, Node, SagaName}; use uuid::Uuid; mod destroyed; -/// Parameters to the instance update saga. + +/// Parameters to the start instance update saga. #[derive(Debug, Deserialize, Serialize)] pub(crate) struct Params { /// Authentication context to use to fetch the instance's current state from @@ -28,6 +27,16 @@ pub(crate) struct Params { pub authz_instance: authz::Instance, } +/// Parameters to the "real" instance update saga. +#[derive(Debug, Deserialize, Serialize)] +struct RealParams { + serialized_authn: authn::saga::Serialized, + + authz_instance: authz::Instance, + + state: InstanceSnapshot, +} + const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const STATE: &str = "state"; @@ -43,15 +52,21 @@ declare_saga_actions! { - siu_lock_instance_undo } - // Fetch the instance and VMM's state. + // Fetch the instance and VMM's state, and start the "real" instance update saga. // N.B. that this must be performed as a separate action from // `LOCK_INSTANCE`, so that if the lookup fails, we will still unwind the // `LOCK_INSTANCE` action and release the lock. - FETCH_STATE -> "state" { - + siu_fetch_state + FETCH_STATE_AND_START_REAL_SAGA -> "state" { + + siu_fetch_state_and_start_real_saga } - UNLOCK_INSTANCE -> "no_result7" { + // Become the instance updater + BECOME_UPDATER -> "generation" { + + siu_become_updater + - siu_lock_instance_undo + } + + UNLOCK_INSTANCE -> "unlocked" { + siu_unlock_instance } } @@ -61,7 +76,7 @@ declare_saga_actions! { #[derive(Debug)] pub(crate) struct SagaInstanceUpdate; impl NexusSaga for SagaInstanceUpdate { - const NAME: &'static str = "instance-update"; + const NAME: &'static str = "start-instance-update"; type Params = Params; fn register_actions(registry: &mut ActionRegistry) { @@ -78,40 +93,76 @@ impl NexusSaga for SagaInstanceUpdate { ACTION_GENERATE_ID.as_ref(), )); builder.append(lock_instance_action()); - builder.append(fetch_state_action()); + builder.append(fetch_state_and_start_real_saga_action()); - // determine which subsaga to execute based on the state of the instance - // and the VMMs associated with it. - const DESTROYED_SUBSAGA_PARAMS: &str = - "params_for_vmm_destroyed_subsaga"; - let subsaga_dag = { - let subsaga_builder = DagBuilder::new(SagaName::new( - destroyed::SagaVmmDestroyed::NAME, - )); - destroyed::SagaVmmDestroyed::make_saga_dag( - ¶ms, - subsaga_builder, - )? 
- }; - - builder.append(Node::constant( - DESTROYED_SUBSAGA_PARAMS, - serde_json::to_value(¶ms).map_err(|e| { - SagaInitError::SerializeError( - DESTROYED_SUBSAGA_PARAMS.to_string(), - e, - ) - })?, - )); + Ok(builder.build()?) + } +} + +struct SagaRealInstanceUpdate; + +impl NexusSaga for SagaRealInstanceUpdate { + const NAME: &'static str = "instance-update"; + type Params = RealParams; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_register_actions(registry); + } - builder.append(Node::subsaga( - "vmm_destroyed_subsaga_no_result", - subsaga_dag, - DESTROYED_SUBSAGA_PARAMS, + fn make_saga_dag( + params: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), )); + builder.append(become_updater_action()); - builder.append(unlock_instance_action()); + // determine which subsaga(s) to execute based on the state of the instance + // and the VMMs associated with it. + if let Some(ref active_vmm) = params.state.active_vmm { + // If the active VMM is `Destroyed`, schedule the active VMM + // destroyed subsaga. + if active_vmm.runtime.state.state() == &InstanceState::Destroyed { + const DESTROYED_SUBSAGA_PARAMS: &str = + "params_for_vmm_destroyed_subsaga"; + let subsaga_params = destroyed::Params { + serialized_authn: params.serialized_authn.clone(), + authz_instance: params.authz_instance.clone(), + vmm_id: active_vmm.id, + instance: params.state.instance.clone(), + }; + let subsaga_dag = { + let subsaga_builder = DagBuilder::new(SagaName::new( + destroyed::SagaVmmDestroyed::NAME, + )); + destroyed::SagaVmmDestroyed::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + builder.append(Node::constant( + DESTROYED_SUBSAGA_PARAMS, + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + DESTROYED_SUBSAGA_PARAMS.to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "vmm_destroyed_subsaga_no_result", + subsaga_dag, + DESTROYED_SUBSAGA_PARAMS, + )); + } + } + + builder.append(unlock_instance_action()); Ok(builder.build()?) } } @@ -120,19 +171,17 @@ impl NexusSaga for SagaInstanceUpdate { async fn siu_lock_instance( sagactx: NexusActionContext, -) -> Result { +) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let datastore = osagactx.datastore(); - let log = osagactx.log(); slog::info!( osagactx.log(), "instance update: attempting to lock instance"; - "instance_id" => %instance.id(), + "instance_id" => %authz_instance.id(), "saga_id" => %lock_id, ); osagactx @@ -143,59 +192,119 @@ async fn siu_lock_instance( .map(|_| ()) } -async fn siu_fetch_state( +async fn siu_fetch_state_and_start_real_saga( sagactx: NexusActionContext, -) -> Result { +) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, .. } = + let Params { serialized_authn, authz_instance, .. 
} = sagactx.saga_params::()?; let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); - osagactx + let state = osagactx .datastore() - .instance_fetch_all(&opctx, authz_instance) + .instance_fetch_with_vmms(&opctx, &authz_instance) .await - .map_err(ActionError::action_failed) + .map_err(ActionError::action_failed)?; + osagactx + .nexus() + .execute_saga::(RealParams { + serialized_authn, + authz_instance, + state, + }) + .await + .map_err(ActionError::action_failed); + + Ok(()) } -async fn siu_unlock_instance( +async fn siu_become_updater( sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; +) -> Result<(), ActionError> { + let RealParams { + ref serialized_authn, ref authz_instance, ref state, .. + } = sagactx.saga_params::()?; + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); + slog::debug!( + osagactx.log(), + "instance update: trying to become instance updater..."; + "instance_id" => %authz_instance.id(), + "saga_id" => %lock_id, + "parent_id" => ?state.instance.runtime_state.updater_id, + ); + osagactx .datastore() - .instance_updater_unlock(&opctx, &authz_instance, &lock_id) - .await?; + .instance_updater_inherit_lock(&opctx, &state.instance, &lock_id) + .await + .map_err(ActionError::action_failed)?; + + slog::info!( + osagactx.log(), + "instance update: became instance updater"; + "instance_id" => %authz_instance.id(), + "saga_id" => %lock_id, + "parent_id" => ?state.instance.runtime_state.updater_id, + ); + + Ok(()) +} + +async fn siu_unbecome_updater( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await?; + Ok(()) } +async fn siu_unlock_instance( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await +} + // N.B. that this has to be a separate function just because the undo action // must return `anyhow::Error` rather than `ActionError`. async fn siu_lock_instance_undo( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); let Params { ref serialized_authn, ref authz_instance, .. 
} = sagactx.saga_params::()?; + unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await?; + Ok(()) +} + +async fn unlock_instance_inner( + serialized_authn: &authn::saga::Serialized, + authz_instance: &authz::Instance, + sagactx: &NexusActionContext, +) -> Result<(), ActionError> { let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let datastore = osagactx.datastore(); - + let osagactx = sagactx.user_data(); slog::info!( osagactx.log(), - "instance update: unlocking instance on unwind"; + "instance update: unlocking instance"; "instance_id" => %authz_instance.id(), "saga_id" => %lock_id, ); - datastore.instance_updater_unlock(&opctx, authz_instance, &lock_id).await?; + osagactx + .datastore() + .instance_updater_unlock(&opctx, authz_instance, &lock_id) + .await + .map_err(ActionError::action_failed)?; Ok(()) } diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs new file mode 100644 index 00000000000..e69de29bb2d From 610d5e044779368f2903a1595f45aec5b1a0146f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 May 2024 14:29:12 -0700 Subject: [PATCH 031/234] deal with dead code in a more compiley way --- nexus/db-queries/src/db/datastore/instance.rs | 133 +++++-- nexus/src/app/instance_network.rs | 340 +++++++++--------- .../app/sagas/instance_update/destroyed.rs | 16 +- nexus/src/app/sagas/instance_update/mod.rs | 4 +- nexus/src/app/sagas/snapshot_create.rs | 2 +- sled-agent/src/common/instance.rs | 9 - sled-agent/src/instance.rs | 6 +- sled-agent/src/instance_manager.rs | 1 - sled-agent/src/sim/collection.rs | 11 +- sled-agent/src/sim/sled_agent.rs | 2 +- 10 files changed, 290 insertions(+), 234 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 6100a628498..80b8d6df95b 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -152,12 +152,14 @@ pub struct InstanceSnapshot { /// when the lock is released. #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct UpdaterLock { - saga_lock_id: Uuid, + updater_id: Uuid, locked_gen: Generation, } /// Errors returned by [`DataStore::instance_updater_lock`]. -#[derive(Debug, thiserror::Error, PartialEq)] +#[derive( + Debug, thiserror::Error, PartialEq, serde::Serialize, serde::Deserialize, +)] pub enum UpdaterLockError { /// The instance was already locked by another saga. #[error("instance already locked by another saga")] @@ -754,7 +756,7 @@ impl DataStore { /// # Arguments /// /// - `authz_instance`: the instance to attempt to lock to lock - /// - `saga_lock_id`: the UUID of the saga that's attempting to lock this + /// - `updater_id`: the UUID of the saga that's attempting to lock this /// instance. /// /// # Returns @@ -769,7 +771,7 @@ impl DataStore { &self, opctx: &OpContext, authz_instance: &authz::Instance, - saga_lock_id: Uuid, + updater_id: Uuid, ) -> Result { use db::schema::instance::dsl; @@ -790,22 +792,21 @@ impl DataStore { // *same* instance at the same time. So, idempotency is probably more // important than handling that extremely unlikely edge case. let mut did_lock = false; + let mut locked_gen = instance.updater_gen; loop { match instance.updater_id { // If the `updater_id` field is not null and the ID equals this // saga's ID, we already have the lock. We're done here! 
- Some(lock_id) if lock_id == saga_lock_id => { - slog::info!( + Some(lock_id) if lock_id == updater_id => { + slog::debug!( &opctx.log, "instance updater lock acquired!"; "instance_id" => %instance_id, - "saga_id" => %saga_lock_id, + "updater_id" => %updater_id, + "locked_gen" => ?locked_gen, "already_locked" => !did_lock, ); - return Ok(UpdaterLock { - saga_lock_id, - locked_gen: instance.updater_gen, - }); + return Ok(UpdaterLock { updater_id, locked_gen }); } // The `updater_id` field is set, but it's not our ID. The instance // is locked by a different saga, so give up. @@ -815,7 +816,7 @@ impl DataStore { "instance is locked by another saga"; "instance_id" => %instance_id, "locked_by" => %lock_id, - "saga_id" => %saga_lock_id, + "updater_id" => %updater_id, ); return Err(UpdaterLockError::AlreadyLocked); } @@ -826,11 +827,12 @@ impl DataStore { // Okay, now attempt to acquire the lock let current_gen = instance.updater_gen; + locked_gen = Generation(current_gen.0.next()); slog::debug!( &opctx.log, "attempting to acquire instance updater lock"; "instance_id" => %instance_id, - "saga_id" => %saga_lock_id, + "updater_id" => %updater_id, "current_gen" => ?current_gen, ); @@ -848,8 +850,8 @@ impl DataStore { // of a non-distributed, single-process mutex. .filter(dsl::updater_gen.eq(current_gen)) .set(( - dsl::updater_gen.eq(dsl::updater_gen + 1), - dsl::updater_id.eq(Some(saga_lock_id)), + dsl::updater_gen.eq(locked_gen), + dsl::updater_id.eq(Some(updater_id)), )) .check_if_exists::(instance_id) .execute_and_check( @@ -878,11 +880,83 @@ impl DataStore { } } + pub async fn instance_updater_inherit_lock( + &self, + opctx: &OpContext, + authz_instance: &authz::Instance, + UpdaterLock { updater_id: parent_id, locked_gen }: UpdaterLock, + child_lock_id: Uuid, + ) -> Result { + use db::schema::instance::dsl; + + let instance_id = authz_instance.id(); + let new_gen = Generation(locked_gen.0.next()); + + let result = diesel::update(dsl::instance) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(instance_id)) + .filter(dsl::updater_gen.eq(locked_gen)) + .filter(dsl::updater_id.eq(parent_id)) + .set(( + dsl::updater_gen.eq(new_gen), + dsl::updater_id.eq(Some(child_lock_id)), + )) + .check_if_exists::(instance_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(instance_id), + ), + ) + })?; + + match result { + // If we updated the record, the lock has been released! Return + // `Ok(true)` to indicate that we released the lock successfully. + UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { + slog::info!( + &opctx.log, + "inherited lock from {parent_id} to {child_lock_id}"; + "instance_id" => %instance_id, + "updater_id" => %child_lock_id, + "locked_gen" => ?new_gen, + "parent_id" => %parent_id, + "parent_gen" => ?locked_gen, + ); + Ok(UpdaterLock { + updater_id: child_lock_id, + locked_gen: new_gen, + }) + } + // The generation has advanced past the generation at which the + // lock was held. This means that we have already inherited the + // lock. Return `Ok(false)` here for idempotency. + UpdateAndQueryResult { + status: UpdateStatus::NotUpdatedButExists, + ref found, + } if found.updater_id == Some(child_lock_id) => { + debug_assert_eq!(found.updater_gen, new_gen,); + Ok(UpdaterLock { + updater_id: child_lock_id, + locked_gen: new_gen, + }) + } + // The instance exists, but the lock ID doesn't match our lock ID. 
+ // This means we were trying to release a lock we never held, whcih + // is almost certainly a programmer error. + UpdateAndQueryResult { .. } => Err(UpdaterLockError::AlreadyLocked), + } + } + /// Release the instance-updater lock acquired by /// [`DataStore::instance_updater_lock`]. /// /// This method will unlock the instance if (and only if) the lock is - /// currently held by the provided `saga_lock_id`. If the lock is held by a + /// currently held by the provided `updater_id`. If the lock is held by a /// different saga UUID, the instance will remain locked. If the instance /// has already been unlocked, this method will return `false`. /// @@ -895,7 +969,7 @@ impl DataStore { &self, opctx: &OpContext, authz_instance: &authz::Instance, - UpdaterLock { saga_lock_id, locked_gen }: UpdaterLock, + UpdaterLock { updater_id, locked_gen }: UpdaterLock, ) -> Result { use db::schema::instance::dsl; @@ -907,7 +981,7 @@ impl DataStore { // Only unlock the instance if: // - the provided updater ID matches that of the saga that has // currently locked this instance. - .filter(dsl::updater_id.eq(Some(saga_lock_id))) + .filter(dsl::updater_id.eq(Some(updater_id))) // - the provided updater generation matches the current updater // generation. .filter(dsl::updater_gen.eq(locked_gen)) @@ -946,8 +1020,17 @@ impl DataStore { // is almost certainly a programmer error. UpdateAndQueryResult { ref found, .. } => { match found.updater_id { - Some(lock_holder) => { - debug_assert_ne!(lock_holder, saga_lock_id); + Some(actual_id) => { + slog::error!( + &opctx.log, + "attempted to release a lock held by another saga"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => %actual_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + debug_assert_ne!(actual_id, updater_id); Err(Error::internal_error( "attempted to release a lock held by another saga! this is a bug!", )) @@ -1057,7 +1140,7 @@ mod tests { stringify!($id) )); assert_eq!( - lock.saga_lock_id, + lock.updater_id, $id, "instance's `updater_id` must be set to {}", stringify!($id), @@ -1127,7 +1210,7 @@ mod tests { .await ) .expect("instance should be locked"); - assert_eq!(lock1.saga_lock_id, saga1); + assert_eq!(lock1.updater_id, saga1); // doing it again should be fine. let lock2 = dbg!( @@ -1138,7 +1221,7 @@ mod tests { .expect( "instance_updater_lock should succeed again with the same saga ID", ); - assert_eq!(lock2.saga_lock_id, saga1); + assert_eq!(lock2.updater_id, saga1); // the generation should not have changed as a result of the second // update. assert_eq!(lock1.locked_gen, lock2.locked_gen); @@ -1199,7 +1282,7 @@ mod tests { // an incorrect one is constructed, or a raw database query // attempts an invalid unlock operation. UpdaterLock { - saga_lock_id: saga2, + updater_id: saga2, locked_gen: lock1.locked_gen, }, ) @@ -1236,7 +1319,7 @@ mod tests { // Again, these fields are private specifically to prevent // you from doing this exact thing. But, we should still // test that we handle it gracefully. 
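// Taken together, the lock methods above act as a compare-and-swap on the
// instance row: the lock is the (updater_id, updater_gen) pair, successful
// acquire/inherit operations advance the generation, and unlock only succeeds
// for the exact holder and generation recorded at acquisition time. The
// standalone model below is a sketch that mirrors those semantics in memory;
// `InstanceRow` and its methods are illustrative stand-ins, not the real
// datastore API.
#[derive(Debug, Clone, Copy)]
struct UpdaterLock {
    updater_id: u128,
    locked_gen: u64,
}

#[derive(Debug, Default)]
struct InstanceRow {
    updater_id: Option<u128>,
    updater_gen: u64,
}

impl InstanceRow {
    /// Acquire the lock, or fail if another updater currently holds it.
    fn lock(&mut self, updater_id: u128) -> Result<UpdaterLock, &'static str> {
        match self.updater_id {
            // Already held by this saga: idempotent success.
            Some(id) if id == updater_id => {
                Ok(UpdaterLock { updater_id, locked_gen: self.updater_gen })
            }
            Some(_) => Err("instance already locked by another saga"),
            None => {
                // The real query guards this write with a WHERE clause on the
                // current generation; a plain bump stands in for that here.
                self.updater_gen += 1;
                self.updater_id = Some(updater_id);
                Ok(UpdaterLock { updater_id, locked_gen: self.updater_gen })
            }
        }
    }

    /// Hand the lock from a parent saga to a child saga, advancing the
    /// generation so the parent's copy of the lock can no longer unlock.
    fn inherit(
        &mut self,
        parent: UpdaterLock,
        child_id: u128,
    ) -> Result<UpdaterLock, &'static str> {
        if self.updater_id == Some(parent.updater_id)
            && self.updater_gen == parent.locked_gen
        {
            self.updater_gen += 1;
            self.updater_id = Some(child_id);
            Ok(UpdaterLock { updater_id: child_id, locked_gen: self.updater_gen })
        } else if self.updater_id == Some(child_id) {
            // Already inherited on a previous attempt: idempotent success.
            Ok(UpdaterLock { updater_id: child_id, locked_gen: self.updater_gen })
        } else {
            Err("instance already locked by another saga")
        }
    }

    /// Release the lock only if the caller still holds it at the recorded
    /// generation; returns whether anything was actually unlocked.
    fn unlock(&mut self, lock: UpdaterLock) -> bool {
        if self.updater_id == Some(lock.updater_id)
            && self.updater_gen == lock.locked_gen
        {
            self.updater_id = None;
            self.updater_gen += 1;
            true
        } else {
            false
        }
    }
}

fn main() {
    // The handoff the update sagas rely on: the start saga locks, the real
    // saga inherits, and only the inheriting saga's lock can release it.
    let mut row = InstanceRow::default();
    let (start_saga, real_saga) = (1u128, 2u128);
    let lock = row.lock(start_saga).expect("unlocked instance can be locked");
    assert!(row.lock(3).is_err(), "other sagas are locked out");
    let lock = row.inherit(lock, real_saga).expect("child inherits the lock");
    assert!(row.unlock(lock), "the inheriting saga releases the lock");
}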
- UpdaterLock { saga_lock_id: saga1, locked_gen: next_gen }, + UpdaterLock { updater_id: saga1, locked_gen: next_gen }, ) .await ) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 946a215c7c7..3ac0757b47e 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -14,11 +14,9 @@ use nexus_db_model::Ipv4NatValues; use nexus_db_model::Vni as DbVni; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use omicron_common::api::external::Error; -use omicron_common::api::internal::nexus; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::SwitchLocation; use omicron_uuid_kinds::GenericUuid; @@ -230,175 +228,175 @@ pub(crate) async fn boundary_switches( Ok(boundary_switches) } -/// Given old and new instance runtime states, determines the desired -/// networking configuration for a given instance and ensures it has been -/// propagated to all relevant sleds. -/// -/// # Arguments -/// -/// - `datastore`: the datastore to use for lookups and updates. -/// - `log`: the [`slog::Logger`] to log to. -/// - `resolver`: an internal DNS resolver to look up DPD service addresses. -/// - `opctx`: An operation context for this operation. -/// - `opctx_alloc`: An operational context list permissions for all sleds. When -/// called by methods on the [`Nexus`] type, this is the `OpContext` used for -/// instance allocation. In a background task, this may be the background -/// task's operational context; nothing stops you from passing the same -/// `OpContext` as both `opctx` and `opctx_alloc`. -/// - `authz_instance``: A resolved authorization context for the instance of -/// interest. -/// - `prev_instance_state``: The most-recently-recorded instance runtime -/// state for this instance. -/// - `new_instance_state`: The instance state that the caller of this routine -/// has observed and that should be used to set up this instance's -/// networking state. -/// -/// # Return value -/// -/// `Ok(())` if this routine completed all the operations it wanted to -/// complete, or an appropriate `Err` otherwise. -#[allow(clippy::too_many_arguments)] // Yeah, I know, I know, Clippy... -#[allow(dead_code)] // TODO(eliza): this probably needs to be deleted eventually -pub(crate) async fn ensure_updated_instance_network_config( - datastore: &DataStore, - log: &slog::Logger, - resolver: &internal_dns::resolver::Resolver, - opctx: &OpContext, - opctx_alloc: &OpContext, - authz_instance: &authz::Instance, - prev_instance_state: &db::model::InstanceRuntimeState, - new_instance_state: &nexus::InstanceRuntimeState, - v2p_manager: &background::Activator, -) -> Result<(), Error> { - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - - // If this instance update is stale, do nothing, since the superseding - // update may have allowed the instance's location to change further. - if prev_instance_state.gen >= new_instance_state.gen.into() { - debug!(log, - "instance state generation already advanced, \ - won't touch network config"; - "instance_id" => %instance_id); - - return Ok(()); - } - - // If this update will retire the instance's active VMM, delete its - // networking state. It will be re-established the next time the - // instance starts. 
- if new_instance_state.propolis_id.is_none() { - info!(log, - "instance cleared its Propolis ID, cleaning network config"; - "instance_id" => %instance_id, - "propolis_id" => ?prev_instance_state.propolis_id); - - clear_instance_networking_state( - datastore, - log, - resolver, - opctx, - opctx_alloc, - authz_instance, - v2p_manager, - ) - .await?; - return Ok(()); - } - - // If the instance still has a migration in progress, don't change - // any networking state until an update arrives that retires that - // migration. - // - // This is needed to avoid the following race: - // - // 1. Migration from S to T completes. - // 2. Migration source sends an update that changes the instance's - // active VMM but leaves the migration ID in place. - // 3. Meanwhile, migration target sends an update that changes the - // instance's active VMM and clears the migration ID. - // 4. The migration target's call updates networking state and commits - // the new instance record. - // 5. The instance migrates from T to T' and Nexus applies networking - // configuration reflecting that the instance is on T'. - // 6. The update in step 2 applies configuration saying the instance - // is on sled T. - if new_instance_state.migration_id.is_some() { - debug!(log, - "instance still has a migration in progress, won't touch \ - network config"; - "instance_id" => %instance_id, - "migration_id" => ?new_instance_state.migration_id); - - return Ok(()); - } - - let new_propolis_id = new_instance_state.propolis_id.unwrap(); - - // Updates that end live migration need to push OPTE V2P state even if - // the instance's active sled did not change (see below). - let migration_retired = prev_instance_state.migration_id.is_some() - && new_instance_state.migration_id.is_none(); - - if (prev_instance_state.propolis_id - == new_instance_state.propolis_id.map(GenericUuid::into_untyped_uuid)) - && !migration_retired - { - debug!(log, "instance didn't move, won't touch network config"; - "instance_id" => %instance_id); - - return Ok(()); - } - - // Either the instance moved from one sled to another, or it attempted - // to migrate and failed. Ensure the correct networking configuration - // exists for its current home. - // - // TODO(#3107) This is necessary even if the instance didn't move, - // because registering a migration target on a sled creates OPTE ports - // for its VNICs, and that creates new V2P mappings on that sled that - // place the relevant virtual IPs on the local sled. Once OPTE stops - // creating these mappings, this path only needs to be taken if an - // instance has changed sleds. - let new_sled_id = match datastore - .vmm_fetch(&opctx, authz_instance, &new_propolis_id) - .await - { - Ok(vmm) => vmm.sled_id, - - // A VMM in the active position should never be destroyed. If the - // sled sending this message is the owner of the instance's last - // active VMM and is destroying it, it should also have retired that - // VMM. - Err(Error::ObjectNotFound { .. 
}) => { - error!(log, "instance's active vmm unexpectedly not found"; - "instance_id" => %instance_id, - "propolis_id" => %new_propolis_id); - - return Ok(()); - } - - Err(e) => return Err(e), - }; - - v2p_manager.activate(); - - let (.., sled) = - LookupPath::new(opctx, datastore).sled_id(new_sled_id).fetch().await?; - - instance_ensure_dpd_config( - datastore, - log, - resolver, - opctx, - opctx_alloc, - instance_id, - &sled.address(), - None, - ) - .await?; - - Ok(()) -} +// /// Given old and new instance runtime states, determines the desired +// /// networking configuration for a given instance and ensures it has been +// /// propagated to all relevant sleds. +// /// +// /// # Arguments +// /// +// /// - `datastore`: the datastore to use for lookups and updates. +// /// - `log`: the [`slog::Logger`] to log to. +// /// - `resolver`: an internal DNS resolver to look up DPD service addresses. +// /// - `opctx`: An operation context for this operation. +// /// - `opctx_alloc`: An operational context list permissions for all sleds. When +// /// called by methods on the [`Nexus`] type, this is the `OpContext` used for +// /// instance allocation. In a background task, this may be the background +// /// task's operational context; nothing stops you from passing the same +// /// `OpContext` as both `opctx` and `opctx_alloc`. +// /// - `authz_instance``: A resolved authorization context for the instance of +// /// interest. +// /// - `prev_instance_state``: The most-recently-recorded instance runtime +// /// state for this instance. +// /// - `new_instance_state`: The instance state that the caller of this routine +// /// has observed and that should be used to set up this instance's +// /// networking state. +// /// +// /// # Return value +// /// +// /// `Ok(())` if this routine completed all the operations it wanted to +// /// complete, or an appropriate `Err` otherwise. +// #[allow(clippy::too_many_arguments)] // Yeah, I know, I know, Clippy... +// #[allow(dead_code)] // TODO(eliza): this probably needs to be deleted eventually +// pub(crate) async fn ensure_updated_instance_network_config( +// datastore: &DataStore, +// log: &slog::Logger, +// resolver: &internal_dns::resolver::Resolver, +// opctx: &OpContext, +// opctx_alloc: &OpContext, +// authz_instance: &authz::Instance, +// prev_instance_state: &db::model::InstanceRuntimeState, +// new_instance_state: &nexus::InstanceRuntimeState, +// v2p_manager: &background::Activator, +// ) -> Result<(), Error> { +// let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + +// // If this instance update is stale, do nothing, since the superseding +// // update may have allowed the instance's location to change further. +// if prev_instance_state.gen >= new_instance_state.gen.into() { +// debug!(log, +// "instance state generation already advanced, \ +// won't touch network config"; +// "instance_id" => %instance_id); + +// return Ok(()); +// } + +// // If this update will retire the instance's active VMM, delete its +// // networking state. It will be re-established the next time the +// // instance starts. 
+// if new_instance_state.propolis_id.is_none() { +// info!(log, +// "instance cleared its Propolis ID, cleaning network config"; +// "instance_id" => %instance_id, +// "propolis_id" => ?prev_instance_state.propolis_id); + +// clear_instance_networking_state( +// datastore, +// log, +// resolver, +// opctx, +// opctx_alloc, +// authz_instance, +// v2p_manager, +// ) +// .await?; +// return Ok(()); +// } + +// // If the instance still has a migration in progress, don't change +// // any networking state until an update arrives that retires that +// // migration. +// // +// // This is needed to avoid the following race: +// // +// // 1. Migration from S to T completes. +// // 2. Migration source sends an update that changes the instance's +// // active VMM but leaves the migration ID in place. +// // 3. Meanwhile, migration target sends an update that changes the +// // instance's active VMM and clears the migration ID. +// // 4. The migration target's call updates networking state and commits +// // the new instance record. +// // 5. The instance migrates from T to T' and Nexus applies networking +// // configuration reflecting that the instance is on T'. +// // 6. The update in step 2 applies configuration saying the instance +// // is on sled T. +// if new_instance_state.migration_id.is_some() { +// debug!(log, +// "instance still has a migration in progress, won't touch \ +// network config"; +// "instance_id" => %instance_id, +// "migration_id" => ?new_instance_state.migration_id); + +// return Ok(()); +// } + +// let new_propolis_id = new_instance_state.propolis_id.unwrap(); + +// // Updates that end live migration need to push OPTE V2P state even if +// // the instance's active sled did not change (see below). +// let migration_retired = prev_instance_state.migration_id.is_some() +// && new_instance_state.migration_id.is_none(); + +// if (prev_instance_state.propolis_id +// == new_instance_state.propolis_id.map(GenericUuid::into_untyped_uuid)) +// && !migration_retired +// { +// debug!(log, "instance didn't move, won't touch network config"; +// "instance_id" => %instance_id); + +// return Ok(()); +// } + +// // Either the instance moved from one sled to another, or it attempted +// // to migrate and failed. Ensure the correct networking configuration +// // exists for its current home. +// // +// // TODO(#3107) This is necessary even if the instance didn't move, +// // because registering a migration target on a sled creates OPTE ports +// // for its VNICs, and that creates new V2P mappings on that sled that +// // place the relevant virtual IPs on the local sled. Once OPTE stops +// // creating these mappings, this path only needs to be taken if an +// // instance has changed sleds. +// let new_sled_id = match datastore +// .vmm_fetch(&opctx, authz_instance, &new_propolis_id) +// .await +// { +// Ok(vmm) => vmm.sled_id, + +// // A VMM in the active position should never be destroyed. If the +// // sled sending this message is the owner of the instance's last +// // active VMM and is destroying it, it should also have retired that +// // VMM. +// Err(Error::ObjectNotFound { .. 
}) => { +// error!(log, "instance's active vmm unexpectedly not found"; +// "instance_id" => %instance_id, +// "propolis_id" => %new_propolis_id); + +// return Ok(()); +// } + +// Err(e) => return Err(e), +// }; + +// v2p_manager.activate(); + +// let (.., sled) = +// LookupPath::new(opctx, datastore).sled_id(new_sled_id).fetch().await?; + +// instance_ensure_dpd_config( +// datastore, +// log, +// resolver, +// opctx, +// opctx_alloc, +// instance_id, +// &sled.address(), +// None, +// ) +// .await?; + +// Ok(()) +// } /// Ensures that the Dendrite configuration for the supplied instance is /// up-to-date. diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index d8e55eb4b17..30416692bcb 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -17,7 +17,6 @@ use nexus_db_model::VmmState; use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::db::datastore::InstanceSnapshot; -use nexus_db_queries::db::identity::Resource; use omicron_common::api::external; use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; @@ -155,15 +154,14 @@ async fn siud_release_sled_resources( async fn siud_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Some((instance, vmm)) = get_destroyed_vmm(&sagactx)? else { - // if the update we are handling is not an active VMM destroyed update, - // bail --- there's nothing to do here. - return Ok(()); - }; - let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, vmm_id, .. } = - sagactx.saga_params::()?; + let Params { + ref serialized_authn, + ref authz_instance, + vmm_id, + instance, + .. + } = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index a6daefd87ce..06c7c6ee0bb 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -186,7 +186,7 @@ async fn siu_lock_instance( ); osagactx .datastore() - .instance_updater_lock(&opctx, authz_instance, &lock_id) + .instance_updater_lock(&opctx, authz_instance, lock_id) .await .map_err(ActionError::action_failed) .map(|_| ()) @@ -203,7 +203,7 @@ async fn siu_fetch_state_and_start_real_saga( let state = osagactx .datastore() - .instance_fetch_with_vmms(&opctx, &authz_instance) + .instance_fetch_all(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)?; osagactx diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index ab5a8bcbf47..76a82e74912 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -828,7 +828,7 @@ async fn ssc_send_snapshot_request_to_sled_agent( let sled_id = osagactx .datastore() - .instance_fetch_with_active_vmm(&opctx, &authz_instance) + .instance_fetch_with_vmm(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)? 
.sled_id(); diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 97abed9c57c..017243a50be 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -420,20 +420,12 @@ mod test { use chrono::Utc; use omicron_common::api::external::Generation; - use omicron_common::api::internal::nexus::InstanceRuntimeState; use propolis_client::types::InstanceState as Observed; use uuid::Uuid; fn make_instance() -> InstanceStates { let propolis_id = PropolisUuid::new_v4(); let now = Utc::now(); - let instance = InstanceRuntimeState { - propolis_id: Some(propolis_id), - dst_propolis_id: None, - migration_id: None, - gen: Generation::new(), - time_updated: now, - }; let vmm = VmmRuntimeState { state: VmmState::Starting, @@ -507,7 +499,6 @@ mod test { fn propolis_terminal_states_request_destroy_action() { for state in [Observed::Destroyed, Observed::Failed] { let mut instance_state = make_instance(); - let original_instance_state = instance_state.clone(); let requested_action = instance_state .apply_propolis_observation(&make_observed_state(state.into())); diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index ea635c42dc8..688ed195b81 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -968,7 +968,6 @@ pub struct Instance { #[derive(Debug)] pub(crate) struct InstanceInitialState { pub hardware: InstanceHardware, - pub instance_runtime: InstanceRuntimeState, pub vmm_runtime: VmmRuntimeState, pub propolis_addr: SocketAddr, } @@ -1004,10 +1003,7 @@ impl Instance { "state" => ?state); let InstanceInitialState { - hardware, - instance_runtime, - vmm_runtime, - propolis_addr, + hardware, vmm_runtime, propolis_addr, .. } = state; let InstanceManagerServices { diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index bb9303f5e28..afa1e7797e1 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -647,7 +647,6 @@ impl InstanceManagerRunner { let state = crate::instance::InstanceInitialState { hardware, - instance_runtime, vmm_runtime, propolis_addr, }; diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index d4e2c365352..c9197fc3b86 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -422,7 +422,6 @@ mod test { use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::DiskRuntimeState; - use omicron_common::api::internal::nexus::InstanceRuntimeState; use omicron_common::api::internal::nexus::SledInstanceState; use omicron_common::api::internal::nexus::VmmRuntimeState; use omicron_common::api::internal::nexus::VmmState; @@ -432,15 +431,7 @@ mod test { fn make_instance( logctx: &LogContext, ) -> (SimObject, Receiver<()>) { - let propolis_id = PropolisUuid::new_v4(); - let instance_vmm = InstanceRuntimeState { - propolis_id: Some(propolis_id), - dst_propolis_id: None, - migration_id: None, - gen: Generation::new(), - time_updated: Utc::now(), - }; - + let propolis_id = Uuid::new_v4(); let vmm_state = VmmRuntimeState { state: VmmState::Starting, gen: Generation::new(), diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 84c9f8b8328..ff93d598e03 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -265,7 +265,7 @@ impl SledAgent { instance_id: InstanceUuid, propolis_id: PropolisUuid, hardware: InstanceHardware, - instance_runtime: 
InstanceRuntimeState, + _instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, metadata: InstanceMetadata, ) -> Result { From d3f8b0be379d3a56b1b3d32e47acda2693aa3876 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 May 2024 14:32:13 -0700 Subject: [PATCH 032/234] fix up stuff --- .../app/sagas/instance_update/destroyed.rs | 19 +------------------ nexus/src/app/sagas/instance_update/mod.rs | 8 +++----- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 30416692bcb..179509fb23d 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -5,18 +5,13 @@ use super::ActionRegistry; use super::NexusActionContext; use super::NexusSaga; -use super::STATE; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::ActionError; use nexus_db_model::Generation; use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; -use nexus_db_model::InstanceState; -use nexus_db_model::Vmm; -use nexus_db_model::VmmState; use nexus_db_queries::authn; use nexus_db_queries::authz; -use nexus_db_queries::db::datastore::InstanceSnapshot; use omicron_common::api::external; use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; @@ -82,7 +77,7 @@ pub(super) struct Params { } #[derive(Debug)] -pub(crate) struct SagaVmmDestroyed; +pub(super) struct SagaVmmDestroyed; impl NexusSaga for SagaVmmDestroyed { const NAME: &'static str = "instance-update-vmm-destroyed"; type Params = Params; @@ -107,18 +102,6 @@ impl NexusSaga for SagaVmmDestroyed { } } -fn get_destroyed_vmm( - sagactx: &NexusActionContext, -) -> Result, ActionError> { - let state = sagactx.lookup::(STATE)?; - match state.active_vmm { - Some(vmm) if vmm.runtime.state.state() == &VmmState::Destroyed => { - Ok(Some((state.instance, vmm))) - } - _ => Ok(None), - } -} - async fn siud_release_sled_resources( sagactx: NexusActionContext, ) -> Result<(), ActionError> { diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 06c7c6ee0bb..4ab3226e6ca 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -9,7 +9,6 @@ use super::{ use crate::app::db::datastore::InstanceSnapshot; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::{authn, authz}; -use nexus_types::identity::Resource; use omicron_common::api::external::InstanceState; use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node, SagaName}; @@ -38,7 +37,6 @@ struct RealParams { } const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; -const STATE: &str = "state"; // instance update saga: actions @@ -63,7 +61,7 @@ declare_saga_actions! 
{ // Become the instance updater BECOME_UPDATER -> "generation" { + siu_become_updater - - siu_lock_instance_undo + - siu_unbecome_updater } UNLOCK_INSTANCE -> "unlocked" { @@ -84,7 +82,7 @@ impl NexusSaga for SagaInstanceUpdate { } fn make_saga_dag( - params: &Self::Params, + _params: &Self::Params, mut builder: DagBuilder, ) -> Result { builder.append(Node::action( @@ -214,7 +212,7 @@ async fn siu_fetch_state_and_start_real_saga( state, }) .await - .map_err(ActionError::action_failed); + .map_err(ActionError::action_failed)?; Ok(()) } From f25792f8aa745177381bf5f2f45687a12a98fa06 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 29 May 2024 14:45:45 -0700 Subject: [PATCH 033/234] clean things up a bit --- nexus/src/app/sagas/instance_update/mod.rs | 134 +++--------------- nexus/src/app/sagas/instance_update/start.rs | 140 +++++++++++++++++++ 2 files changed, 157 insertions(+), 117 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 4ab3226e6ca..21c40ba8242 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -16,15 +16,20 @@ use uuid::Uuid; mod destroyed; -/// Parameters to the start instance update saga. -#[derive(Debug, Deserialize, Serialize)] -pub(crate) struct Params { - /// Authentication context to use to fetch the instance's current state from - /// the database. - pub serialized_authn: authn::saga::Serialized, - - pub authz_instance: authz::Instance, -} +// The public interface to this saga is actually a smaller saga that starts the +// "real" update saga, which inherits the lock from the start saga. This is +// because the decision of which subsaga(s) to run depends on the state of the +// instance record read from the database *once the lock has been acquired*, +// and the saga DAG for the "real" instance update saga may be constructed only +// after the instance state has been fetched. However, since the the instance +// state must be read inside the lock, that *also* needs to happen in a saga, +// so that the lock is always dropped when unwinding. Thus, we have a second, +// smaller saga which starts our real saga, and then the real saga, which +// decides what DAG to build based on the instance fetched by the start saga. +// +// Don't worry, this won't be on the test. +mod start; +pub(crate) use self::start::{Params, SagaInstanceUpdate}; /// Parameters to the "real" instance update saga. #[derive(Debug, Deserialize, Serialize)] @@ -43,21 +48,6 @@ const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; declare_saga_actions! { instance_update; - // Acquire the instance updater" lock with this saga's ID if no other saga - // is currently updating the instance. - LOCK_INSTANCE -> "saga_instance_lock_gen" { - + siu_lock_instance - - siu_lock_instance_undo - } - - // Fetch the instance and VMM's state, and start the "real" instance update saga. - // N.B. that this must be performed as a separate action from - // `LOCK_INSTANCE`, so that if the lookup fails, we will still unwind the - // `LOCK_INSTANCE` action and release the lock. - FETCH_STATE_AND_START_REAL_SAGA -> "state" { - + siu_fetch_state_and_start_real_saga - } - // Become the instance updater BECOME_UPDATER -> "generation" { + siu_become_updater @@ -70,36 +60,9 @@ declare_saga_actions! 
{ } // instance update saga: definition +struct SagaDoActualInstanceUpdate; -#[derive(Debug)] -pub(crate) struct SagaInstanceUpdate; -impl NexusSaga for SagaInstanceUpdate { - const NAME: &'static str = "start-instance-update"; - type Params = Params; - - fn register_actions(registry: &mut ActionRegistry) { - instance_update_register_actions(registry); - } - - fn make_saga_dag( - _params: &Self::Params, - mut builder: DagBuilder, - ) -> Result { - builder.append(Node::action( - INSTANCE_LOCK_ID, - "GenerateInstanceLockId", - ACTION_GENERATE_ID.as_ref(), - )); - builder.append(lock_instance_action()); - builder.append(fetch_state_and_start_real_saga_action()); - - Ok(builder.build()?) - } -} - -struct SagaRealInstanceUpdate; - -impl NexusSaga for SagaRealInstanceUpdate { +impl NexusSaga for SagaDoActualInstanceUpdate { const NAME: &'static str = "instance-update"; type Params = RealParams; @@ -165,58 +128,6 @@ impl NexusSaga for SagaRealInstanceUpdate { } } -// instance update saga: action implementations - -async fn siu_lock_instance( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - slog::info!( - osagactx.log(), - "instance update: attempting to lock instance"; - "instance_id" => %authz_instance.id(), - "saga_id" => %lock_id, - ); - osagactx - .datastore() - .instance_updater_lock(&opctx, authz_instance, lock_id) - .await - .map_err(ActionError::action_failed) - .map(|_| ()) -} - -async fn siu_fetch_state_and_start_real_saga( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let Params { serialized_authn, authz_instance, .. } = - sagactx.saga_params::()?; - let opctx = - crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); - - let state = osagactx - .datastore() - .instance_fetch_all(&opctx, &authz_instance) - .await - .map_err(ActionError::action_failed)?; - osagactx - .nexus() - .execute_saga::(RealParams { - serialized_authn, - authz_instance, - state, - }) - .await - .map_err(ActionError::action_failed)?; - - Ok(()) -} - async fn siu_become_updater( sagactx: NexusActionContext, ) -> Result<(), ActionError> { @@ -244,7 +155,7 @@ async fn siu_become_updater( slog::info!( osagactx.log(), - "instance update: became instance updater"; + "Now, I am become Updater, the destroyer of VMMs."; "instance_id" => %authz_instance.id(), "saga_id" => %lock_id, "parent_id" => ?state.instance.runtime_state.updater_id, @@ -271,17 +182,6 @@ async fn siu_unlock_instance( unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await } -// N.B. that this has to be a separate function just because the undo action -// must return `anyhow::Error` rather than `ActionError`. -async fn siu_lock_instance_undo( - sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - let Params { ref serialized_authn, ref authz_instance, .. 
} = - sagactx.saga_params::()?; - unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await?; - Ok(()) -} - async fn unlock_instance_inner( serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index e69de29bb2d..529d23c0cbe 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -0,0 +1,140 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// instance update start saga + +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::{ + ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ACTION_GENERATE_ID, INSTANCE_LOCK_ID, +}; +use crate::app::sagas::declare_saga_actions; +use nexus_db_queries::{authn, authz}; +use serde::{Deserialize, Serialize}; +use steno::{ActionError, DagBuilder, Node}; +use uuid::Uuid; + +/// Parameters to the start instance update saga. +#[derive(Debug, Deserialize, Serialize)] +pub(crate) struct Params { + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub serialized_authn: authn::saga::Serialized, + + pub authz_instance: authz::Instance, +} + +// instance update saga: actions + +declare_saga_actions! { + instance_update; + + // Acquire the instance updater" lock with this saga's ID if no other saga + // is currently updating the instance. + LOCK_INSTANCE -> "saga_instance_lock_gen" { + + siu_lock_instance + - siu_lock_instance_undo + } + + // Fetch the instance and VMM's state, and start the "real" instance update saga. + // N.B. that this must be performed as a separate action from + // `LOCK_INSTANCE`, so that if the lookup fails, we will still unwind the + // `LOCK_INSTANCE` action and release the lock. + FETCH_STATE_AND_START_REAL_SAGA -> "state" { + + siu_fetch_state_and_start_real_saga + } +} + +// instance update saga: definition + +#[derive(Debug)] +pub(crate) struct SagaInstanceUpdate; +impl NexusSaga for SagaInstanceUpdate { + const NAME: &'static str = "start-instance-update"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_register_actions(registry); + } + + fn make_saga_dag( + _params: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(lock_instance_action()); + builder.append(fetch_state_and_start_real_saga_action()); + + Ok(builder.build()?) + } +} + +// start instance update saga: action implementations + +async fn siu_lock_instance( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { ref serialized_authn, ref authz_instance, .. 
} = + sagactx.saga_params::()?; + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + slog::info!( + osagactx.log(), + "instance update: attempting to lock instance"; + "instance_id" => %authz_instance.id(), + "saga_id" => %lock_id, + ); + osagactx + .datastore() + .instance_updater_lock(&opctx, authz_instance, &lock_id) + .await + .map_err(ActionError::action_failed) + .map(|_| ()) +} + +async fn siu_lock_instance_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let Params { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + super::unlock_instance_inner(serialized_authn, authz_instance, &sagactx) + .await?; + Ok(()) +} + +async fn siu_fetch_state_and_start_real_saga( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let Params { serialized_authn, authz_instance, .. } = + sagactx.saga_params::()?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + + let state = osagactx + .datastore() + .instance_fetch_all(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + osagactx + .nexus() + .execute_saga::(super::RealParams { + serialized_authn, + authz_instance, + state, + }) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} From 78cb0b383959dede307c22b2ead072ab2fd0450d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 30 May 2024 14:32:40 -0700 Subject: [PATCH 034/234] big post-merge update --- nexus/src/app/sagas/instance_update/mod.rs | 38 ++++++++++++-------- nexus/src/app/sagas/instance_update/start.rs | 13 ++++--- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 21c40ba8242..7846d1bef96 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -6,6 +6,7 @@ use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, }; +use crate::app::db::datastore::instance; use crate::app::db::datastore::InstanceSnapshot; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::{authn, authz}; @@ -39,9 +40,12 @@ struct RealParams { authz_instance: authz::Instance, state: InstanceSnapshot, + + orig_lock: instance::UpdaterLock, } const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; +const INSTANCE_LOCK: &str = "updater_lock"; // instance update saga: actions @@ -49,7 +53,7 @@ declare_saga_actions! { instance_update; // Become the instance updater - BECOME_UPDATER -> "generation" { + BECOME_UPDATER -> "updater_lock" { + siu_become_updater - siu_unbecome_updater } @@ -130,26 +134,31 @@ impl NexusSaga for SagaDoActualInstanceUpdate { async fn siu_become_updater( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let RealParams { - ref serialized_authn, ref authz_instance, ref state, .. + ref serialized_authn, ref authz_instance, orig_lock, .. 
} = sagactx.saga_params::()?; - - let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let saga_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let osagactx = sagactx.user_data(); + slog::debug!( osagactx.log(), "instance update: trying to become instance updater..."; "instance_id" => %authz_instance.id(), - "saga_id" => %lock_id, - "parent_id" => ?state.instance.runtime_state.updater_id, + "saga_id" => %saga_id, + "parent_lock" => ?orig_lock, ); - osagactx + let lock = osagactx .datastore() - .instance_updater_inherit_lock(&opctx, &state.instance, &lock_id) + .instance_updater_inherit_lock( + &opctx, + &authz_instance, + orig_lock, + saga_id, + ) .await .map_err(ActionError::action_failed)?; @@ -157,11 +166,10 @@ async fn siu_become_updater( osagactx.log(), "Now, I am become Updater, the destroyer of VMMs."; "instance_id" => %authz_instance.id(), - "saga_id" => %lock_id, - "parent_id" => ?state.instance.runtime_state.updater_id, + "saga_id" => %saga_id, ); - Ok(()) + Ok(lock) } async fn siu_unbecome_updater( @@ -187,7 +195,7 @@ async fn unlock_instance_inner( authz_instance: &authz::Instance, sagactx: &NexusActionContext, ) -> Result<(), ActionError> { - let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + let lock = sagactx.lookup::(INSTANCE_LOCK)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let osagactx = sagactx.user_data(); @@ -195,12 +203,12 @@ async fn unlock_instance_inner( osagactx.log(), "instance update: unlocking instance"; "instance_id" => %authz_instance.id(), - "saga_id" => %lock_id, + "lock" => ?lock, ); osagactx .datastore() - .instance_updater_unlock(&opctx, authz_instance, &lock_id) + .instance_updater_unlock(&opctx, authz_instance, lock) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 529d23c0cbe..e7953d19013 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -13,6 +13,7 @@ use super::{ ACTION_GENERATE_ID, INSTANCE_LOCK_ID, }; use crate::app::sagas::declare_saga_actions; +use nexus_db_queries::db::datastore::instance; use nexus_db_queries::{authn, authz}; use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node}; @@ -35,7 +36,7 @@ declare_saga_actions! { // Acquire the instance updater" lock with this saga's ID if no other saga // is currently updating the instance. - LOCK_INSTANCE -> "saga_instance_lock_gen" { + LOCK_INSTANCE -> "instance_lock" { + siu_lock_instance - siu_lock_instance_undo } @@ -81,7 +82,7 @@ impl NexusSaga for SagaInstanceUpdate { async fn siu_lock_instance( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let osagactx = sagactx.user_data(); let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; @@ -96,10 +97,9 @@ async fn siu_lock_instance( ); osagactx .datastore() - .instance_updater_lock(&opctx, authz_instance, &lock_id) + .instance_updater_lock(&opctx, authz_instance, lock_id) .await .map_err(ActionError::action_failed) - .map(|_| ()) } async fn siu_lock_instance_undo( @@ -115,9 +115,11 @@ async fn siu_lock_instance_undo( async fn siu_fetch_state_and_start_real_saga( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); let Params { serialized_authn, authz_instance, .. 
} = sagactx.saga_params::()?; + let orig_lock = + sagactx.lookup::(INSTANCE_LOCK_ID)?; + let osagactx = sagactx.user_data(); let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); @@ -132,6 +134,7 @@ async fn siu_fetch_state_and_start_real_saga( serialized_authn, authz_instance, state, + orig_lock, }) .await .map_err(ActionError::action_failed)?; From 65c77f01c411d4b8a361e72b01d82fb4f46e0c45 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 3 Jun 2024 11:25:23 -0700 Subject: [PATCH 035/234] post-rebase fixup --- nexus/src/app/sagas/instance_update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 7846d1bef96..f382dac24b0 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -90,7 +90,7 @@ impl NexusSaga for SagaDoActualInstanceUpdate { if let Some(ref active_vmm) = params.state.active_vmm { // If the active VMM is `Destroyed`, schedule the active VMM // destroyed subsaga. - if active_vmm.runtime.state.state() == &InstanceState::Destroyed { + if active_vmm.runtime.state.state() == InstanceState::Destroyed { const DESTROYED_SUBSAGA_PARAMS: &str = "params_for_vmm_destroyed_subsaga"; let subsaga_params = destroyed::Params { From c59d8f90fc88e2f73c262f93a933a3076b419f0f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 3 Jun 2024 11:25:33 -0700 Subject: [PATCH 036/234] regenerate sled-agent openapi --- openapi/sled-agent.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 3e96ab3a0c8..b041bd69703 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -4615,14 +4615,6 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "instance_state": { - "description": "The sled's conception of the state of the instance.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceRuntimeState" - } - ] - }, "migration_state": { "nullable": true, "description": "The current state of any in-progress migration for this instance, as understood by this sled.", @@ -4650,7 +4642,6 @@ } }, "required": [ - "instance_state", "propolis_id", "vmm_state" ] From 63e514e72c1bbb007926ecc076dc79ffb4df12a8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 3 Jun 2024 13:22:32 -0700 Subject: [PATCH 037/234] bunch of saga plumbing fixes --- nexus/src/app/sagas/instance_update/start.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index e7953d19013..37c00d4efbc 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -10,7 +10,7 @@ use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, - ACTION_GENERATE_ID, INSTANCE_LOCK_ID, + ACTION_GENERATE_ID, INSTANCE_LOCK, INSTANCE_LOCK_ID, }; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::db::datastore::instance; @@ -32,11 +32,11 @@ pub(crate) struct Params { // instance update saga: actions declare_saga_actions! { - instance_update; + start_instance_update; // Acquire the instance updater" lock with this saga's ID if no other saga // is currently updating the instance. 
- LOCK_INSTANCE -> "instance_lock" { + LOCK_INSTANCE -> "updater_lock" { + siu_lock_instance - siu_lock_instance_undo } @@ -59,7 +59,9 @@ impl NexusSaga for SagaInstanceUpdate { type Params = Params; fn register_actions(registry: &mut ActionRegistry) { - instance_update_register_actions(registry); + start_instance_update_register_actions(registry); + super::SagaDoActualInstanceUpdate::register_actions(registry); + super::destroyed::SagaVmmDestroyed::register_actions(registry); } fn make_saga_dag( @@ -117,8 +119,7 @@ async fn siu_fetch_state_and_start_real_saga( ) -> Result<(), ActionError> { let Params { serialized_authn, authz_instance, .. } = sagactx.saga_params::()?; - let orig_lock = - sagactx.lookup::(INSTANCE_LOCK_ID)?; + let orig_lock = sagactx.lookup::(INSTANCE_LOCK)?; let osagactx = sagactx.user_data(); let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); From 6404be6b1bb187caf3dd57cee6f3457190855cc2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 3 Jun 2024 13:36:43 -0700 Subject: [PATCH 038/234] handle unable-to-lock more gracefully --- nexus/src/app/sagas/instance_update/start.rs | 33 +++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 37c00d4efbc..f6ccb1053fe 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -84,7 +84,7 @@ impl NexusSaga for SagaInstanceUpdate { async fn siu_lock_instance( sagactx: NexusActionContext, -) -> Result { +) -> Result, ActionError> { let osagactx = sagactx.user_data(); let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; @@ -97,11 +97,20 @@ async fn siu_lock_instance( "instance_id" => %authz_instance.id(), "saga_id" => %lock_id, ); - osagactx + let locked = osagactx .datastore() .instance_updater_lock(&opctx, authz_instance, lock_id) - .await - .map_err(ActionError::action_failed) + .await; + match locked { + Ok(lock) => Ok(Some(lock)), + // Don't return an error if we can't take the lock. This saga will + // simply not start the real instance update saga, rather than having to unwind. + Err(instance::UpdaterLockError::AlreadyLocked) => Ok(None), + // Okay, that's a real error. Time to die! + Err(instance::UpdaterLockError::Query(e)) => { + Err(ActionError::action_failed(e)) + } + } } async fn siu_lock_instance_undo( @@ -119,8 +128,22 @@ async fn siu_fetch_state_and_start_real_saga( ) -> Result<(), ActionError> { let Params { serialized_authn, authz_instance, .. } = sagactx.saga_params::()?; - let orig_lock = sagactx.lookup::(INSTANCE_LOCK)?; let osagactx = sagactx.user_data(); + let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + // Did we get the lock? If so, we can start the next saga, otherwise, just + // exit gracefully. + let Some(orig_lock) = + sagactx.lookup::>(INSTANCE_LOCK)? + else { + slog::info!( + osagactx.log(), + "instance update: instance is already locked! 
doing nothing..."; + "instance_id" => %authz_instance.id(), + "saga_id" => %lock_id, + ); + return Ok(()); + }; + let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); From a40b35fde03940e01a6bee689829904f0ed04a2c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 3 Jun 2024 15:23:28 -0700 Subject: [PATCH 039/234] fix lock generations getting eaten --- nexus/db-queries/src/db/datastore/instance.rs | 1 + nexus/src/app/sagas/instance_update/destroyed.rs | 3 ++- nexus/src/app/sagas/instance_update/mod.rs | 13 ++++++++++--- sled-agent/src/common/instance.rs | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 80b8d6df95b..a9ee9fd81a9 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -805,6 +805,7 @@ impl DataStore { "updater_id" => %updater_id, "locked_gen" => ?locked_gen, "already_locked" => !did_lock, + "locked_gen" => ?locked_gen, ); return Ok(UpdaterLock { updater_id, locked_gen }); } diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 179509fb23d..3a6d3ac9c66 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -10,8 +10,10 @@ use crate::app::sagas::ActionError; use nexus_db_model::Generation; use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; +use nexus_db_model::InstanceState; use nexus_db_queries::authn; use nexus_db_queries::authz; +use nexus_db_queries::db::datastore::instance; use omicron_common::api::external; use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; @@ -249,7 +251,6 @@ async fn siud_update_instance( ) -> Result<(), ActionError> { let Params { ref authz_instance, ref vmm_id, instance, .. } = sagactx.saga_params::()?; - let osagactx = sagactx.user_data(); let new_runtime = InstanceRuntimeState { propolis_id: None, diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index f382dac24b0..86b74fe255d 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -8,9 +8,9 @@ use super::{ }; use crate::app::db::datastore::instance; use crate::app::db::datastore::InstanceSnapshot; +use crate::app::db::model::VmmState; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::{authn, authz}; -use omicron_common::api::external::InstanceState; use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node, SagaName}; use uuid::Uuid; @@ -90,7 +90,7 @@ impl NexusSaga for SagaDoActualInstanceUpdate { if let Some(ref active_vmm) = params.state.active_vmm { // If the active VMM is `Destroyed`, schedule the active VMM // destroyed subsaga. 
- if active_vmm.runtime.state.state() == InstanceState::Destroyed { + if active_vmm.runtime.state == VmmState::Destroyed { const DESTROYED_SUBSAGA_PARAMS: &str = "params_for_vmm_destroyed_subsaga"; let subsaga_params = destroyed::Params { @@ -206,11 +206,18 @@ async fn unlock_instance_inner( "lock" => ?lock, ); - osagactx + let did_unlock = osagactx .datastore() .instance_updater_unlock(&opctx, authz_instance, lock) .await .map_err(ActionError::action_failed)?; + slog::info!( + osagactx.log(), + "instance update: unlocked instance"; + "instance_id" => %authz_instance.id(), + "did_unlock" => ?did_unlock, + ); + Ok(()) } diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 017243a50be..8a741d47012 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -400,7 +400,7 @@ impl InstanceStates { &self, migration_ids: &Option, ) -> bool { - match (self.migration, migration_ids) { + match (self.migration.as_ref(), migration_ids) { // If the migration ID is already set, and this is a request to set // IDs, the records match if the relevant IDs match. (Some(migration), Some(ids)) => { From b00683ace2ed530d4c8fd67f93b89a4dd2719f22 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 4 Jun 2024 10:15:10 -0700 Subject: [PATCH 040/234] more consistent naming for logs --- nexus/db-queries/src/db/datastore/instance.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index a9ee9fd81a9..80b8d6df95b 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -805,7 +805,6 @@ impl DataStore { "updater_id" => %updater_id, "locked_gen" => ?locked_gen, "already_locked" => !did_lock, - "locked_gen" => ?locked_gen, ); return Ok(UpdaterLock { updater_id, locked_gen }); } From da48db98b843c5a9b5502910b9fb94b840fe8c64 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 5 Jun 2024 10:34:47 -0700 Subject: [PATCH 041/234] rm dead import --- nexus/src/app/sagas/instance_update/destroyed.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 3a6d3ac9c66..76d0f248019 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -13,7 +13,6 @@ use nexus_db_model::InstanceRuntimeState; use nexus_db_model::InstanceState; use nexus_db_queries::authn; use nexus_db_queries::authz; -use nexus_db_queries::db::datastore::instance; use omicron_common::api::external; use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; From 8078508474686e0ec145fa3c835da9d287baf6f2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 10 Jun 2024 12:16:55 -0700 Subject: [PATCH 042/234] post-rebase remove dead imports --- nexus/src/app/instance.rs | 1 - nexus/src/app/sagas/instance_update/destroyed.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 112d54f9c92..5ab982918ee 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -47,7 +47,6 @@ use omicron_common::api::external::LookupResult; use omicron_common::api::external::NameOrId; use omicron_common::api::external::UpdateResult; use omicron_common::api::internal::nexus; -use omicron_common::api::internal::nexus::VmmState; use omicron_common::api::internal::shared::SourceNatConfig; use 
omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 76d0f248019..ec0999a3c6a 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -13,7 +13,6 @@ use nexus_db_model::InstanceRuntimeState; use nexus_db_model::InstanceState; use nexus_db_queries::authn; use nexus_db_queries::authz; -use omicron_common::api::external; use omicron_common::api::external::Error; use serde::{Deserialize, Serialize}; use slog::info; From b791897ba1f97f2ac07d8b0f26d1e3e73be3929a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Jun 2024 12:45:36 -0700 Subject: [PATCH 043/234] update openapi another time --- clients/nexus-client/src/lib.rs | 2 +- openapi/nexus-internal.json | 9 +++++++++ sled-agent/src/common/instance.rs | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index ea6f53deaaf..51f0dade1c4 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -139,7 +139,7 @@ impl From Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - // migration_state: s.migration_state.map(Into::into), + migration_state: s.migration_state.map(Into::into), } } } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index b3f2a3dd9e4..0c6b060f5e6 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4669,6 +4669,15 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { + "migration_state": { + "nullable": true, + "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, "propolis_id": { "description": "The ID of the VMM whose state is being reported.", "allOf": [ diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 8a741d47012..a4d94dfac51 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -616,6 +616,7 @@ mod test { // but should not change the instance's migration IDs. 
let observed = ObservedPropolisState { vmm_state: PropolisInstanceState(Observed::Failed), + migration_status: ObservedMigrationStatus::Failed, time: Utc::now(), }; From 2bc81839dfc7b3096c53fb8d29e547a94f47ff76 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 13 Jun 2024 15:52:46 -0700 Subject: [PATCH 044/234] hack up the CTE to do vmm-and-migration updates --- nexus/db-queries/src/db/datastore/instance.rs | 3 +- nexus/db-queries/src/db/queries/instance.rs | 165 +++++++++++------- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 80b8d6df95b..964a84ec1d2 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -552,10 +552,9 @@ impl DataStore { migration: &Option, ) -> Result { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( - *instance_id, - new_instance.clone(), *vmm_id, new_vmm.clone(), + Some((*instance_id, new_instance.clone())), migration.clone(), ); diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index fded585b670..9b7dacef317 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -85,14 +85,15 @@ use crate::db::update_and_check::UpdateStatus; // can happen if, e.g., sled agent sends a message indicating that a retired VMM // has finally been destroyed when its instance has since been deleted.) pub struct InstanceAndVmmUpdate { - instance_find: Box + Send>, vmm_find: Box + Send>, - instance_update: Box + Send>, vmm_update: Box + Send>, - migration: Option, + instance: Option, + migration: Option, } -struct MigrationUpdate { +struct Update { + name: &'static str, + id: &'static str, find: Box + Send>, update: Box + Send>, } @@ -155,35 +156,17 @@ where impl InstanceAndVmmUpdate { pub fn new( - instance_id: InstanceUuid, - new_instance_runtime_state: InstanceRuntimeState, vmm_id: PropolisUuid, new_vmm_runtime_state: VmmRuntimeState, + instance: Option<(InstanceUuid, InstanceRuntimeState)>, migration: Option, ) -> Self { - let instance_find = Box::new( - instance_dsl::instance - .filter(instance_dsl::id.eq(instance_id.into_untyped_uuid())) - .select(instance_dsl::id), - ); - let vmm_find = Box::new( vmm_dsl::vmm .filter(vmm_dsl::id.eq(vmm_id.into_untyped_uuid())) .select(vmm_dsl::id), ); - let instance_update = Box::new( - diesel::update(instance_dsl::instance) - .filter(instance_dsl::time_deleted.is_null()) - .filter(instance_dsl::id.eq(instance_id.into_untyped_uuid())) - .filter( - instance_dsl::state_generation - .lt(new_instance_runtime_state.gen), - ) - .set(new_instance_runtime_state), - ); - let vmm_update = Box::new( diesel::update(vmm_dsl::vmm) .filter(vmm_dsl::time_deleted.is_null()) @@ -192,6 +175,32 @@ impl InstanceAndVmmUpdate { .set(new_vmm_runtime_state), ); + let instance = instance.map(|(instance_id, new_runtime_state)| { + let instance_id = instance_id.into_untyped_uuid(); + let find = Box::new( + instance_dsl::instance + .filter(instance_dsl::id.eq(instance_id)) + .select(instance_dsl::id), + ); + + let update = Box::new( + diesel::update(instance_dsl::instance) + .filter(instance_dsl::time_deleted.is_null()) + .filter(instance_dsl::id.eq(instance_id)) + .filter( + instance_dsl::state_generation + .lt(new_runtime_state.gen), + ) + .set(new_runtime_state), + ); + Update { + find, + update, + name: "instance", + id: instance_dsl::id::NAME, + } + }); + let migration 
= migration.map( |MigrationRuntimeState { role, @@ -238,11 +247,16 @@ impl InstanceAndVmmUpdate { )), ), }; - MigrationUpdate { find, update } + Update { + find, + update, + name: "migration", + id: migration_dsl::id::NAME, + } }, ); - Self { instance_find, vmm_find, instance_update, vmm_update, migration } + Self { vmm_find, vmm_update, instance, migration } } pub async fn execute_and_check( @@ -299,36 +313,67 @@ impl Query for InstanceAndVmmUpdate { impl RunQueryDsl for InstanceAndVmmUpdate {} +impl Update { + fn push_subqueries<'b>( + &'b self, + out: &mut AstPass<'_, 'b, Pg>, + ) -> QueryResult<()> { + out.push_sql(self.name); + out.push_sql("_found AS (SELECT ("); + self.find.walk_ast(out.reborrow())?; + out.push_sql(") AS ID), "); + out.push_sql(self.name); + out.push_sql("_updated AS ("); + self.update.walk_ast(out.reborrow())?; + out.push_sql("RETURNING id), "); + out.push_sql(self.name); + out.push_sql("_result AS (SELECT "); + out.push_sql(self.name); + out.push_sql("_found."); + out.push_identifier(self.id)?; + out.push_sql(" AS found, "); + out.push_sql(self.name); + out.push_sql("_updated."); + out.push_identifier(self.id)?; + out.push_sql(" AS updated"); + out.push_sql(" FROM "); + out.push_sql(self.name); + out.push_sql("_found LEFT JOIN "); + out.push_sql(self.name); + out.push_sql("_updated ON "); + out.push_sql(self.name); + out.push_sql("_found."); + out.push_identifier(self.id)?; + out.push_sql("= "); + out.push_sql(self.name); + out.push_sql("_updated."); + out.push_identifier(self.id)?; + out.push_sql(")"); + + Ok(()) + } +} + impl QueryFragment for InstanceAndVmmUpdate { fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, Pg>) -> QueryResult<()> { - out.push_sql("WITH instance_found AS (SELECT ("); - self.instance_find.walk_ast(out.reborrow())?; - out.push_sql(") AS id), "); + out.push_sql("WITH "); + if let Some(ref instance) = self.instance { + instance.push_subqueries(&mut out)?; + out.push_sql(", "); + } + + if let Some(ref migration) = self.migration { + migration.push_subqueries(&mut out)?; + out.push_sql(", "); + } out.push_sql("vmm_found AS (SELECT ("); self.vmm_find.walk_ast(out.reborrow())?; out.push_sql(") AS id), "); - if let Some(MigrationUpdate { ref find, .. }) = self.migration { - out.push_sql("migration_found AS (SELECT ("); - find.walk_ast(out.reborrow())?; - out.push_sql(") AS id), "); - } - - out.push_sql("instance_updated AS ("); - self.instance_update.walk_ast(out.reborrow())?; - out.push_sql(" RETURNING id), "); - out.push_sql("vmm_updated AS ("); self.vmm_update.walk_ast(out.reborrow())?; out.push_sql(" RETURNING id), "); - - if let Some(MigrationUpdate { ref update, .. 
}) = self.migration { - out.push_sql("migration_updated AS ("); - update.walk_ast(out.reborrow())?; - out.push_sql(" RETURNING id), "); - } - out.push_sql("vmm_result AS ("); out.push_sql("SELECT vmm_found."); out.push_identifier(vmm_dsl::id::NAME)?; @@ -353,34 +398,24 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(instance_dsl::id::NAME)?; out.push_sql(" = instance_updated."); out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(")"); + out.push_sql(") "); - if self.migration.is_some() { - out.push_sql(", "); - out.push_sql("migration_result AS ("); - out.push_sql("SELECT migration_found."); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(" AS found, migration_updated."); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(" AS updated"); - out.push_sql( - " FROM migration_found LEFT JOIN migration_updated ON migration_found.", - ); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(" = migration_updated."); - out.push_identifier(migration_dsl::id::NAME)?; - out.push_sql(")"); + out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); + if self.instance.is_some() { + out.push_sql("instance_result.found, instance_result.updated, "); + } else { + out.push_sql("NULL, NULL, "); } - out.push_sql(" "); - out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - out.push_sql("instance_result.found, instance_result.updated, "); if self.migration.is_some() { out.push_sql("migration_result.found, migration_result.updated "); } else { out.push_sql("NULL, NULL "); } - out.push_sql("FROM vmm_result, instance_result"); + out.push_sql("FROM vmm_result"); + if self.instance.is_some() { + out.push_sql(", instance_result"); + } if self.migration.is_some() { out.push_sql(", migration_result"); } From 419d0673909c0cc971455b088077be8e329fd88f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 14 Jun 2024 09:53:14 -0700 Subject: [PATCH 045/234] actually write migration states to the db --- nexus/db-queries/src/db/datastore/vmm.rs | 44 ++++++++++++++++++++++++ nexus/src/app/instance.rs | 25 ++++++++------ 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 7ce8c1551e4..d1570819219 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -27,6 +27,7 @@ use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; +use omicron_common::api::internal::nexus::MigrationRuntimeState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use std::net::SocketAddr; @@ -141,6 +142,49 @@ impl DataStore { Ok(updated) } + pub async fn vmm_and_migration_update_runtime( + &self, + vmm_id: Uuid, + new_runtime: &VmmRuntimeState, + migration: Option<&MigrationRuntimeState>, + ) -> Result<(bool, Option), Error> { + let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( + vmm_id, + new_runtime.clone(), + None, + migration.cloned(), + ); + + // The InstanceAndVmmUpdate query handles and indicates failure to find + // either the VMM or the migration, so a query failure here indicates + // some kind of internal error and not a failed lookup. + let result = query + .execute_and_check(&*self.pool_connection_unauthorized().await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + + debug_assert_eq!(result.instance_status, None); + + let vmm_updated = match result.vmm_status { + Some(UpdateStatus::Updated) => true, + Some(UpdateStatus::NotUpdatedButExists) => false, + None => false, + }; + + let migration_updated = if migration.is_some() { + Some(match result.migration_status { + Some(UpdateStatus::Updated) => true, + Some(UpdateStatus::NotUpdatedButExists) => false, + None => false, + }) + } else { + debug_assert_eq!(result.migration_status, None); + None + }; + + Ok((vmm_updated, migration_updated)) + } + /// Forcibly overwrites the Propolis IP/Port in the supplied VMM's record with /// the supplied Propolis IP. /// diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 5ab982918ee..da09edd8a8f 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1328,11 +1328,10 @@ impl super::Nexus { if let Some(state) = state { let update_result = self .db_datastore - .vmm_update_runtime( - &state.propolis_id, + .vmm_and_migration_update_runtime( + state.propolis_id, &state.vmm_state.into(), - // TODO(eliza): re-enable writing back migrations! - // &state.migration_state, + state.migration_state.as_ref(), ) .await; @@ -1341,11 +1340,11 @@ impl super::Nexus { "instance_id" => %instance_id, "propolis_id" => %state.propolis_id, "result" => ?update_result); - + let (vmm_updated, migration_updated) = update_result?; Ok(InstanceUpdateResult { instance_updated: false, - vmm_updated: update_result?, - migration_updated: None, + vmm_updated, + migration_updated, }) } else { // There was no instance state to write back, so --- perhaps @@ -1524,20 +1523,24 @@ impl super::Nexus { instance_id: &InstanceUuid, new_runtime_state: &nexus::SledInstanceState, ) -> Result<(), Error> { + let migration = new_runtime_state.migration_state.as_ref(); let propolis_id = new_runtime_state.propolis_id; info!(opctx.log, "received new VMM runtime state from sled agent"; "instance_id" => %instance_id, "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state); + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?migration); - let updated = self + let (vmm_updated, migration_updated) = self .db_datastore - .vmm_update_runtime( - &propolis_id, + .vmm_and_migration_update_runtime( + propolis_id, // TODO(eliza): probably should take this by value... 
&new_runtime_state.vmm_state.clone().into(), + migration, ) .await?; + let updated = vmm_updated || migration_updated.unwrap_or(false); if updated { let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) From dfe25948417305b88064a2203a58df9335fd2216 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 14 Jun 2024 11:00:27 -0700 Subject: [PATCH 046/234] remove duplicate code in CTE --- nexus/db-queries/src/db/queries/instance.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 9b7dacef317..c476ff20216 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -384,20 +384,6 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(vmm_dsl::id::NAME)?; out.push_sql(" = vmm_updated."); out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql("), "); - - out.push_sql("instance_result AS ("); - out.push_sql("SELECT instance_found."); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(" AS found, instance_updated."); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(" AS updated"); - out.push_sql( - " FROM instance_found LEFT JOIN instance_updated ON instance_found.", - ); - out.push_identifier(instance_dsl::id::NAME)?; - out.push_sql(" = instance_updated."); - out.push_identifier(instance_dsl::id::NAME)?; out.push_sql(") "); out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); From da7bbb9316440072978fc9b7ba7f75b7eb34a7e7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 14 Jun 2024 11:40:36 -0700 Subject: [PATCH 047/234] add expectorate tests for CTE --- nexus/db-queries/src/db/queries/instance.rs | 112 ++++++++++++++++++ ...ance_and_vmm_update_vmm_and_imigration.sql | 55 +++++++++ ...stance_and_vmm_update_vmm_and_instance.sql | 48 ++++++++ ..._vmm_update_vmm_instance_and_migration.sql | 84 +++++++++++++ .../instance_and_vmm_update_vmm_only.sql | 24 ++++ 5 files changed, 323 insertions(+) create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index c476ff20216..011020c10fd 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -409,3 +409,115 @@ impl QueryFragment for InstanceAndVmmUpdate { Ok(()) } } + +#[cfg(test)] +mod test { + use super::*; + use crate::db::model::Generation; + use crate::db::model::VmmState; + use crate::db::raw_query_builder::expectorate_query_contents; + use chrono::Utc; + use omicron_common::api::internal::nexus::MigrationRole; + use omicron_common::api::internal::nexus::MigrationRuntimeState; + use omicron_common::api::internal::nexus::MigrationState; + use uuid::Uuid; + + // These tests are a bit of a "change detector", but they're here to help + // with debugging too. If you change this query, it can be useful to see + // exactly how the output SQL has been altered. 
+ + fn mk_vmm_state() -> VmmRuntimeState { + VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation::new(), + state: VmmState::Starting, + } + } + + fn mk_migration_state() -> MigrationRuntimeState { + let migration_id = Uuid::nil(); + MigrationRuntimeState { + migration_id, + state: MigrationState::Pending, + role: MigrationRole::Source, + gen: Generation::new().into(), + time_updated: Utc::now(), + } + } + + fn mk_instance_state() -> (Uuid, InstanceRuntimeState) { + let id = Uuid::nil(); + let state = InstanceRuntimeState { + time_updated: Utc::now(), + gen: Generation::new(), + propolis_id: Some(Uuid::nil()), + dst_propolis_id: Some(Uuid::nil()), + migration_id: Some(Uuid::nil()), + nexus_state: nexus_db_model::InstanceState::Vmm, + }; + (id, state) + } + + #[tokio::test] + async fn expectorate_query_only_vmm() { + let vmm_id = Uuid::nil(); + let vmm_state = mk_vmm_state(); + + let query = InstanceAndVmmUpdate::new(vmm_id, vmm_state, None, None); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_only.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_and_instance() { + let vmm_id = Uuid::nil(); + let vmm_state = mk_vmm_state(); + let instance = mk_instance_state(); + + let query = + InstanceAndVmmUpdate::new(vmm_id, vmm_state, Some(instance), None); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_and_instance.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_and_migration() { + let vmm_id = Uuid::nil(); + let vmm_state = mk_vmm_state(); + let migration = mk_migration_state(); + + let query = + InstanceAndVmmUpdate::new(vmm_id, vmm_state, None, Some(migration)); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_and_imigration.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_instance_and_migration() { + let vmm_id = Uuid::nil(); + let vmm_state = mk_vmm_state(); + let instance = mk_instance_state(); + let migration = mk_migration_state(); + + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + Some(instance), + Some(migration), + ); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql", + ) + .await; + } +} diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql new file mode 100644 index 00000000000..9c54c8b8efb --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql @@ -0,0 +1,55 @@ +WITH + migration_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_updated + AS ( + UPDATE + migration + SET + source_state = $2, time_source_updated = $3 + WHERE + (migration.id = $4 AND migration.source_propolis_id = $5) AND migration.source_gen < $6 + RETURNING + id + ), + migration_result + AS ( + SELECT + migration_found.id AS found, migration_updated.id AS updated + FROM + migration_found LEFT JOIN migration_updated ON migration_found.id = migration_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $7) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $8, state_generation = $9, state = $10 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $11) AND vmm.state_generation < $12 + RETURNING + id + ), + vmm_result + AS ( + SELECT + 
vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, vmm_result.updated, NULL, NULL, migration_result.found, migration_result.updated +FROM + vmm_result, migration_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql new file mode 100644 index 00000000000..ab4ef78b182 --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql @@ -0,0 +1,48 @@ +WITH + instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), + instance_updated + AS ( + UPDATE + instance + SET + time_state_updated = $2, + state_generation = $3, + active_propolis_id = $4, + target_propolis_id = $5, + migration_id = $6, + state = $7 + WHERE + ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 + RETURNING + id + ), + instance_result + AS ( + SELECT + instance_found.id AS found, instance_updated.id AS updated + FROM + instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $10) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $11, state_generation = $12, state = $13 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $14) AND vmm.state_generation < $15 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, vmm_result.updated, instance_result.found, instance_result.updated, NULL, NULL +FROM + vmm_result, instance_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql new file mode 100644 index 00000000000..bee0b68a3a4 --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql @@ -0,0 +1,84 @@ +WITH + instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), + instance_updated + AS ( + UPDATE + instance + SET + time_state_updated = $2, + state_generation = $3, + active_propolis_id = $4, + target_propolis_id = $5, + migration_id = $6, + state = $7 + WHERE + ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 + RETURNING + id + ), + instance_result + AS ( + SELECT + instance_found.id AS found, instance_updated.id AS updated + FROM + instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id + ), + migration_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $10 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_updated + AS ( + UPDATE + migration + SET + source_state = $11, time_source_updated = $12 + WHERE + (migration.id = $13 AND migration.source_propolis_id = $14) AND migration.source_gen < $15 + RETURNING + id + ), + migration_result + AS ( + SELECT + migration_found.id AS found, migration_updated.id AS updated + FROM + migration_found LEFT JOIN migration_updated ON migration_found.id = migration_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $16) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $17, state_generation = $18, state = $19 + WHERE + 
((vmm.time_deleted IS NULL) AND vmm.id = $20) AND vmm.state_generation < $21 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + instance_result.found, + instance_result.updated, + migration_result.found, + migration_result.updated +FROM + vmm_result, instance_result, migration_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql new file mode 100644 index 00000000000..cfe56740fe7 --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql @@ -0,0 +1,24 @@ +WITH + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $1) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $2, state_generation = $3, state = $4 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $5) AND vmm.state_generation < $6 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, vmm_result.updated, NULL, NULL, NULL, NULL +FROM + vmm_result From ec93ca9ecea872ed01d2a5516e9ea8d5cc37f2e8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 19 Jun 2024 12:09:58 -0700 Subject: [PATCH 048/234] uuids are typed now --- .../app/background/tasks/instance_watcher.rs | 32 ++++++------------- .../app/sagas/instance_update/destroyed.rs | 15 ++++++--- nexus/src/app/sagas/instance_update/mod.rs | 4 ++- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 511451d7b48..03b34c44609 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -161,28 +161,16 @@ impl InstanceWatcher { "updating instance state"; "state" => ?new_runtime_state.vmm_state.state, ); - check.result = crate::app::instance::notify_instance_updated( - &datastore, - &resolver, - &opctx, - &opctx, - &opctx.log, - &InstanceUuid::from_untyped_uuid(target.instance_id), - &new_runtime_state, - &v2p_manager, - ) - .await - .map_err(|e| { - slog::warn!( - opctx.log, - "error updating instance"; - "error" => ?e, - "state" => ?new_runtime_state.vmm_state.state, - ); - Incomplete::UpdateFailed - }) - .and_then(|updated| { - updated.ok_or_else(|| { + check.result = + crate::app::instance::notify_instance_updated_background( + &datastore, + &opctx, + &saga_req, + InstanceUuid::from_untyped_uuid(target.instance_id), + new_runtime_state, + ) + .await + .map_err(|e| { slog::warn!( opctx.log, "error updating instance"; diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index ec0999a3c6a..cba2c31bf39 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -14,9 +14,11 @@ use nexus_db_model::InstanceState; use nexus_db_queries::authn; use nexus_db_queries::authz; use omicron_common::api::external::Error; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; use serde::{Deserialize, Serialize}; use slog::info; -use uuid::Uuid; // instance update (active VMM destroyed) subsaga: actions @@ -71,7 +73,7 @@ pub(super) struct Params { pub(super) 
authz_instance: authz::Instance, /// The UUID of the VMM that was destroyed. - pub(super) vmm_id: Uuid, + pub(super) vmm_id: PropolisUuid, pub(super) instance: Instance, } @@ -122,7 +124,7 @@ async fn siud_release_sled_resources( osagactx .datastore() - .sled_reservation_delete(&opctx, vmm_id) + .sled_reservation_delete(&opctx, vmm_id.into_untyped_uuid()) .await .or_else(|err| { // Necessary for idempotency @@ -161,7 +163,7 @@ async fn siud_release_virtual_provisioning( .datastore() .virtual_provisioning_collection_delete_instance( &opctx, - authz_instance.id(), + InstanceUuid::from_untyped_uuid(authz_instance.id()), instance.project_id, i64::from(instance.ncpus.0 .0), instance.memory, @@ -269,7 +271,10 @@ async fn siud_update_instance( // It's okay for this to fail, it just means that the active VMM ID has changed. let _ = osagactx .datastore() - .instance_update_runtime(&authz_instance.id(), &new_runtime) + .instance_update_runtime( + &InstanceUuid::from_untyped_uuid(authz_instance.id()), + &new_runtime, + ) .await; Ok(()) } diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 86b74fe255d..eaac701c9ed 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -11,6 +11,8 @@ use crate::app::db::datastore::InstanceSnapshot; use crate::app::db::model::VmmState; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::{authn, authz}; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::PropolisUuid; use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node, SagaName}; use uuid::Uuid; @@ -96,7 +98,7 @@ impl NexusSaga for SagaDoActualInstanceUpdate { let subsaga_params = destroyed::Params { serialized_authn: params.serialized_authn.clone(), authz_instance: params.authz_instance.clone(), - vmm_id: active_vmm.id, + vmm_id: PropolisUuid::from_untyped_uuid(active_vmm.id), instance: params.state.instance.clone(), }; let subsaga_dag = { From 4a5368de7e87f3287400f14920d9cd41ff3742de Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 19 Jun 2024 16:35:29 -0700 Subject: [PATCH 049/234] tear up way more of sled-agent --- clients/nexus-client/src/lib.rs | 16 +- clients/sled-agent-client/src/lib.rs | 16 +- common/src/api/internal/nexus.rs | 56 +- nexus/db-queries/src/db/datastore/instance.rs | 27 +- nexus/db-queries/src/db/datastore/vmm.rs | 26 +- nexus/db-queries/src/db/queries/instance.rs | 362 ++++++++++--- nexus/src/app/instance.rs | 184 +------ openapi/nexus-internal.json | 35 +- openapi/sled-agent.json | 35 +- sled-agent/src/common/instance.rs | 507 +++++++++--------- sled-agent/src/instance.rs | 13 +- sled-agent/src/sim/collection.rs | 10 +- sled-agent/src/sim/instance.rs | 115 ++-- sled-agent/src/sim/sled_agent.rs | 3 +- 14 files changed, 683 insertions(+), 722 deletions(-) diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 51f0dade1c4..b7722144fe3 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -139,7 +139,8 @@ impl From Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: s.migration_state.map(Into::into), + migration_in: s.migration_in.map(Into::into), + migration_out: s.migration_out.map(Into::into), } } } @@ -152,7 +153,6 @@ impl From ) -> Self { Self { migration_id: s.migration_id, - role: s.role.into(), state: s.state.into(), gen: s.gen, time_updated: s.time_updated, @@ -160,18 +160,6 @@ impl From } } -impl From - for types::MigrationRole -{ 
- fn from(s: omicron_common::api::internal::nexus::MigrationRole) -> Self { - use omicron_common::api::internal::nexus::MigrationRole as Input; - match s { - Input::Source => Self::Source, - Input::Target => Self::Target, - } - } -} - impl From for types::MigrationState { diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 9ba9138e181..ba3a1256ce3 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -164,7 +164,8 @@ impl From Self { propolis_id: s.propolis_id, vmm_state: s.vmm_state.into(), - migration_state: s.migration_state.map(Into::into), + migration_in: s.migration_in.map(Into::into), + migration_out: s.migration_out.map(Into::into), } } } @@ -176,25 +177,12 @@ impl From Self { migration_id: s.migration_id, state: s.state.into(), - role: s.role.into(), gen: s.gen, time_updated: s.time_updated, } } } -impl From - for omicron_common::api::internal::nexus::MigrationRole -{ - fn from(r: types::MigrationRole) -> Self { - use omicron_common::api::internal::nexus::MigrationRole as Output; - match r { - types::MigrationRole::Source => Output::Source, - types::MigrationRole::Target => Output::Target, - } - } -} - impl From for omicron_common::api::internal::nexus::MigrationState { diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 39cde8e89a0..75eb0b37eab 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -123,9 +123,32 @@ pub struct SledInstanceState { /// The most recent state of the sled's VMM process. pub vmm_state: VmmRuntimeState, - /// The current state of any in-progress migration for this instance, as - /// understood by this sled. - pub migration_state: Option, + /// The current state of any inbound migration to this VMM. + pub migration_in: Option, + + /// The state of any outbound migration to this VMM. + pub migration_out: Option, +} + +#[derive(Copy, Clone, Debug, Default)] +pub struct Migrations<'state> { + pub migration_in: Option<&'state MigrationRuntimeState>, + pub migration_out: Option<&'state MigrationRuntimeState>, +} + +impl Migrations<'_> { + pub fn empty() -> Self { + Self { migration_in: None, migration_out: None } + } +} + +impl SledInstanceState { + pub fn migrations(&self) -> Migrations<'_> { + Migrations { + migration_in: self.migration_in.as_ref(), + migration_out: self.migration_out.as_ref(), + } + } } /// An update from a sled regarding the state of a migration, indicating the @@ -134,7 +157,6 @@ pub struct SledInstanceState { pub struct MigrationRuntimeState { pub migration_id: Uuid, pub state: MigrationState, - pub role: MigrationRole, pub gen: Generation, /// Timestamp for the migration state update. @@ -189,32 +211,6 @@ impl fmt::Display for MigrationState { } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, -)] -#[serde(rename_all = "snake_case")] -pub enum MigrationRole { - /// This update concerns the source VMM of a migration. - Source, - /// This update concerns the target VMM of a migration. - Target, -} - -impl MigrationRole { - pub fn label(&self) -> &'static str { - match self { - Self::Source => "source", - Self::Target => "target", - } - } -} - -impl fmt::Display for MigrationRole { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(self.label()) - } -} - // Oximeter producer/collector objects. /// The kind of metric producer this is. 
diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 964a84ec1d2..0e1209933a9 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -47,7 +47,7 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::MigrationRuntimeState; +use omicron_common::api::internal::nexus::Migrations; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -549,13 +549,13 @@ impl DataStore { new_instance: &InstanceRuntimeState, vmm_id: &PropolisUuid, new_vmm: &VmmRuntimeState, - migration: &Option, + migrations: Migrations<'_>, ) -> Result { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( *vmm_id, new_vmm.clone(), Some((*instance_id, new_instance.clone())), - migration.clone(), + migrations, ); // The InstanceAndVmmUpdate query handles and indicates failure to find @@ -566,26 +566,21 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - let instance_updated = match result.instance_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }; - + let instance_updated = result.instance_status.was_updated(); let vmm_updated = match result.vmm_status { Some(UpdateStatus::Updated) => true, Some(UpdateStatus::NotUpdatedButExists) => false, None => false, }; - let migration_updated = if migration.is_some() { - Some(match result.migration_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }) + let migration_updated = if migrations.migration_in.is_some() + || migrations.migration_out.is_some() + { + Some( + result.migration_in_status.was_updated() + || result.migration_out_status.was_updated(), + ) } else { - debug_assert_eq!(result.migration_status, None); None }; diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index d1570819219..798bdf2b4f5 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -27,7 +27,7 @@ use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; -use omicron_common::api::internal::nexus::MigrationRuntimeState; +use omicron_common::api::internal::nexus::Migrations; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use std::net::SocketAddr; @@ -144,15 +144,15 @@ impl DataStore { pub async fn vmm_and_migration_update_runtime( &self, - vmm_id: Uuid, + vmm_id: PropolisUuid, new_runtime: &VmmRuntimeState, - migration: Option<&MigrationRuntimeState>, + migrations: Migrations<'_>, ) -> Result<(bool, Option), Error> { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( vmm_id, new_runtime.clone(), None, - migration.cloned(), + migrations, ); // The InstanceAndVmmUpdate query handles and indicates failure to find @@ -163,25 +163,23 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - debug_assert_eq!(result.instance_status, None); + // debug_assert_eq!(result.instance_status, ); let vmm_updated = match result.vmm_status { Some(UpdateStatus::Updated) => true, 
Some(UpdateStatus::NotUpdatedButExists) => false, None => false, }; - - let migration_updated = if migration.is_some() { - Some(match result.migration_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }) + let migration_updated = if migrations.migration_in.is_some() + || migrations.migration_out.is_some() + { + Some( + result.migration_in_status.was_updated() + || result.migration_out_status.was_updated(), + ) } else { - debug_assert_eq!(result.migration_status, None); None }; - Ok((vmm_updated, migration_updated)) } diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index 011020c10fd..c8bb6a7e091 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -18,9 +18,7 @@ use nexus_db_model::{ }, Generation, InstanceRuntimeState, MigrationState, VmmRuntimeState, }; -use omicron_common::api::internal::nexus::{ - MigrationRole, MigrationRuntimeState, -}; +use omicron_common::api::internal::nexus::{MigrationRuntimeState, Migrations}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; use uuid::Uuid; @@ -88,7 +86,8 @@ pub struct InstanceAndVmmUpdate { vmm_find: Box + Send>, vmm_update: Box + Send>, instance: Option, - migration: Option, + migration_in: Option, + migration_out: Option, } struct Update { @@ -104,16 +103,38 @@ pub struct InstanceAndVmmUpdateResult { /// `Some(status)` if the target instance was found; the wrapped /// `UpdateStatus` indicates whether the row was updated. `None` if the /// instance was not found. - pub instance_status: Option, + pub instance_status: RecordUpdateStatus, /// `Some(status)` if the target VMM was found; the wrapped `UpdateStatus` /// indicates whether the row was updated. `None` if the VMM was not found. pub vmm_status: Option, - /// `Some(status)` if the target migration was found; the wrapped `UpdateStatus` - /// indicates whether the row was updated. `None` if the migration was not - /// found, or no migration update was performed. - pub migration_status: Option, + /// `Some(status)` if the inbound migration was found; the wrapped `UpdateStatus` + /// indicates whether the row was updated. `None` if the inbound migration + /// was not found, or no migration update was performed. + pub migration_in_status: RecordUpdateStatus, + + /// `Some(status)` if the outbound migration was found; the wrapped `UpdateStatus` + /// indicates whether the row was updated. `None` if the inbound migration + /// was not found, or no migration update was performed. + pub migration_out_status: RecordUpdateStatus, +} + +#[derive(Copy, Clone, PartialEq, Debug)] +pub enum RecordUpdateStatus { + /// No record was found for the provided ID. + NotFound, + /// No record for this table was provided as part of the update. + NotProvided, + /// An update for this record was provided, and a a record matching the + /// provided ID exists. 
+ Found(UpdateStatus), +} + +impl RecordUpdateStatus { + pub fn was_updated(self) -> bool { + matches!(self, Self::Found(UpdateStatus::Updated)) + } } /// Computes the update status to return from the results of queries that find @@ -159,7 +180,7 @@ impl InstanceAndVmmUpdate { vmm_id: PropolisUuid, new_vmm_runtime_state: VmmRuntimeState, instance: Option<(InstanceUuid, InstanceRuntimeState)>, - migration: Option, + Migrations { migration_in, migration_out }: Migrations<'_>, ) -> Self { let vmm_find = Box::new( vmm_dsl::vmm @@ -201,75 +222,98 @@ impl InstanceAndVmmUpdate { } }); - let migration = migration.map( + fn migration_find( + migration_id: Uuid, + ) -> Box + Send> { + Box::new( + migration_dsl::migration + .filter(migration_dsl::id.eq(migration_id)) + .filter(migration_dsl::time_deleted.is_null()) + .select(migration_dsl::id), + ) + } + + let migration_in = migration_in.cloned().map( |MigrationRuntimeState { - role, migration_id, state, gen, time_updated, }| { let state = MigrationState::from(state); - let find = Box::new( - migration_dsl::migration + let gen = Generation::from(gen); + let update = Box::new( + diesel::update(migration_dsl::migration) .filter(migration_dsl::id.eq(migration_id)) - .filter(migration_dsl::time_deleted.is_null()) - .select(migration_dsl::id), + .filter( + migration_dsl::target_propolis_id + .eq(vmm_id.into_untyped_uuid()), + ) + .filter(migration_dsl::target_gen.lt(gen)) + .set(( + migration_dsl::target_state.eq(state), + migration_dsl::time_target_updated.eq(time_updated), + )), ); + Update { + find: migration_find(migration_id), + update, + name: "migration_in", + id: migration_dsl::id::NAME, + } + }, + ); + + let migration_out = migration_out.cloned().map( + |MigrationRuntimeState { + migration_id, + state, + gen, + time_updated, + }| { + let state = MigrationState::from(state); let gen = Generation::from(gen); - let update: Box + Send> = match role { - MigrationRole::Target => Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::target_propolis_id - .eq(vmm_id.into_untyped_uuid()), - ) - .filter(migration_dsl::target_gen.lt(gen)) - .set(( - migration_dsl::target_state.eq(state), - migration_dsl::time_target_updated - .eq(time_updated), - )), - ), - MigrationRole::Source => Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::source_propolis_id - .eq(vmm_id.into_untyped_uuid()), - ) - .filter(migration_dsl::source_gen.lt(gen)) - .set(( - migration_dsl::source_state.eq(state), - migration_dsl::time_source_updated - .eq(time_updated), - )), - ), - }; + let update = Box::new( + diesel::update(migration_dsl::migration) + .filter(migration_dsl::id.eq(migration_id)) + .filter( + migration_dsl::source_propolis_id + .eq(vmm_id.into_untyped_uuid()), + ) + .filter(migration_dsl::source_gen.lt(gen)) + .set(( + migration_dsl::source_state.eq(state), + migration_dsl::time_source_updated.eq(time_updated), + )), + ); Update { - find, + find: migration_find(migration_id), update, - name: "migration", + name: "migration_out", id: migration_dsl::id::NAME, } }, ); - Self { vmm_find, vmm_update, instance, migration } + Self { vmm_find, vmm_update, instance, migration_in, migration_out } } pub async fn execute_and_check( self, conn: &(impl async_bb8_diesel::AsyncConnection + Sync), ) -> Result { + let has_migration_in = self.migration_in.is_some(); + let has_migration_out = self.migration_out.is_some(); + let has_instance = 
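            // A consumer-side illustration (assumed usage, mirroring the datastore
            // methods above): callers that only need a boolean can collapse the
            // three-way `RecordUpdateStatus` with `was_updated()`:
            //
            //     let migration_updated = result.migration_in_status.was_updated()
            //         || result.migration_out_status.was_updated();
            //
            // `NotFound` and `NotProvided` both read as "not updated" here, but they
            // remain distinguishable for callers that care whether the row was
            // missing versus simply not part of this update.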
self.instance.is_some(); let ( vmm_found, vmm_updated, instance_found, instance_updated, - migration_found, - migration_updated, + migration_in_found, + migration_in_updated, + migration_out_found, + migration_out_updated, ) = self .get_result_async::<( Option, @@ -278,19 +322,43 @@ impl InstanceAndVmmUpdate { Option, Option, Option, + Option, + Option, + // WHEW! )>(conn) .await?; - let instance_status = - compute_update_status(instance_found, instance_updated); let vmm_status = compute_update_status(vmm_found, vmm_updated); - let migration_status = - compute_update_status(migration_found, migration_updated); + + let instance_status = if has_instance { + compute_update_status(instance_found, instance_updated) + .map(RecordUpdateStatus::Found) + .unwrap_or(RecordUpdateStatus::NotFound) + } else { + RecordUpdateStatus::NotProvided + }; + + let migration_in_status = if has_migration_in { + compute_update_status(migration_in_found, migration_in_updated) + .map(RecordUpdateStatus::Found) + .unwrap_or(RecordUpdateStatus::NotFound) + } else { + RecordUpdateStatus::NotProvided + }; + + let migration_out_status = if has_migration_out { + compute_update_status(migration_out_found, migration_out_updated) + .map(RecordUpdateStatus::Found) + .unwrap_or(RecordUpdateStatus::NotFound) + } else { + RecordUpdateStatus::NotProvided + }; Ok(InstanceAndVmmUpdateResult { instance_status, vmm_status, - migration_status, + migration_in_status, + migration_out_status, }) } } @@ -308,6 +376,8 @@ impl Query for InstanceAndVmmUpdate { Nullable, Nullable, Nullable, + Nullable, + Nullable, ); } @@ -362,8 +432,13 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_sql(", "); } - if let Some(ref migration) = self.migration { - migration.push_subqueries(&mut out)?; + if let Some(ref m) = self.migration_in { + m.push_subqueries(&mut out)?; + out.push_sql(", "); + } + + if let Some(ref m) = self.migration_out { + m.push_subqueries(&mut out)?; out.push_sql(", "); } @@ -386,24 +461,38 @@ impl QueryFragment for InstanceAndVmmUpdate { out.push_identifier(vmm_dsl::id::NAME)?; out.push_sql(") "); - out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - if self.instance.is_some() { - out.push_sql("instance_result.found, instance_result.updated, "); - } else { - out.push_sql("NULL, NULL, "); + fn push_select_from_result( + update: Option<&Update>, + out: &mut AstPass<'_, '_, Pg>, + ) { + if let Some(update) = update { + out.push_sql(update.name); + out.push_sql("_result.found, "); + out.push_sql(update.name); + out.push_sql("_result.updated"); + } else { + out.push_sql("NULL, NULL") + } } - if self.migration.is_some() { - out.push_sql("migration_result.found, migration_result.updated "); - } else { - out.push_sql("NULL, NULL "); - } + out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); + push_select_from_result(self.instance.as_ref(), &mut out); + out.push_sql(", "); + push_select_from_result(self.migration_in.as_ref(), &mut out); + out.push_sql(", "); + push_select_from_result(self.migration_out.as_ref(), &mut out); + out.push_sql(" "); + out.push_sql("FROM vmm_result"); if self.instance.is_some() { out.push_sql(", instance_result"); } - if self.migration.is_some() { - out.push_sql(", migration_result"); + if self.migration_in.is_some() { + out.push_sql(", migration_in_result"); + } + + if self.migration_out.is_some() { + out.push_sql(", migration_out_result"); } Ok(()) @@ -417,7 +506,6 @@ mod test { use crate::db::model::VmmState; use crate::db::raw_query_builder::expectorate_query_contents; use 
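    // For orientation, an assumed sketch of the query's final SELECT when an
    // instance and both migrations are provided (the exact text is pinned by the
    // expectorate fixtures exercised below):
    //
    //     SELECT vmm_result.found, vmm_result.updated,
    //            instance_result.found, instance_result.updated,
    //            migration_in_result.found, migration_in_result.updated,
    //            migration_out_result.found, migration_out_result.updated
    //     FROM vmm_result, instance_result, migration_in_result, migration_out_result
    //
    // with `NULL, NULL` substituted in the SELECT list for any sub-update that was
    // not provided.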
chrono::Utc; - use omicron_common::api::internal::nexus::MigrationRole; use omicron_common::api::internal::nexus::MigrationRuntimeState; use omicron_common::api::internal::nexus::MigrationState; use uuid::Uuid; @@ -439,14 +527,13 @@ mod test { MigrationRuntimeState { migration_id, state: MigrationState::Pending, - role: MigrationRole::Source, gen: Generation::new().into(), time_updated: Utc::now(), } } - fn mk_instance_state() -> (Uuid, InstanceRuntimeState) { - let id = Uuid::nil(); + fn mk_instance_state() -> (InstanceUuid, InstanceRuntimeState) { + let id = InstanceUuid::nil(); let state = InstanceRuntimeState { time_updated: Utc::now(), gen: Generation::new(), @@ -460,10 +547,15 @@ mod test { #[tokio::test] async fn expectorate_query_only_vmm() { - let vmm_id = Uuid::nil(); + let vmm_id = PropolisUuid::nil(); let vmm_state = mk_vmm_state(); - let query = InstanceAndVmmUpdate::new(vmm_id, vmm_state, None, None); + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + None, + Migrations::default(), + ); expectorate_query_contents( &query, "tests/output/instance_and_vmm_update_vmm_only.sql", @@ -473,12 +565,16 @@ mod test { #[tokio::test] async fn expectorate_query_vmm_and_instance() { - let vmm_id = Uuid::nil(); + let vmm_id = PropolisUuid::nil(); let vmm_state = mk_vmm_state(); let instance = mk_instance_state(); - let query = - InstanceAndVmmUpdate::new(vmm_id, vmm_state, Some(instance), None); + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + Some(instance), + Migrations::default(), + ); expectorate_query_contents( &query, "tests/output/instance_and_vmm_update_vmm_and_instance.sql", @@ -487,23 +583,66 @@ mod test { } #[tokio::test] - async fn expectorate_query_vmm_and_migration() { - let vmm_id = Uuid::nil(); + async fn expectorate_query_vmm_and_migration_in() { + let vmm_id = PropolisUuid::nil(); + let vmm_state = mk_vmm_state(); + let migration = mk_migration_state(); + + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + None, + Migrations { migration_in: Some(&migration), migration_out: None }, + ); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_and_migration_in.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_instance_and_migration_in() { + let vmm_id = PropolisUuid::nil(); let vmm_state = mk_vmm_state(); + let instance = mk_instance_state(); let migration = mk_migration_state(); - let query = - InstanceAndVmmUpdate::new(vmm_id, vmm_state, None, Some(migration)); + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + Some(instance), + Migrations { migration_in: Some(&migration), migration_out: None }, + ); expectorate_query_contents( &query, - "tests/output/instance_and_vmm_update_vmm_and_imigration.sql", + "tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql", ) .await; } #[tokio::test] - async fn expectorate_query_vmm_instance_and_migration() { - let vmm_id = Uuid::nil(); + async fn expectorate_query_vmm_and_migration_out() { + let vmm_id = PropolisUuid::nil(); + let vmm_state = mk_vmm_state(); + let migration = mk_migration_state(); + + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + None, + Migrations { migration_out: Some(&migration), migration_in: None }, + ); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_and_migration_out.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_instance_and_migration_out() { + let vmm_id = PropolisUuid::nil(); let vmm_state = 
mk_vmm_state(); let instance = mk_instance_state(); let migration = mk_migration_state(); @@ -512,11 +651,58 @@ mod test { vmm_id, vmm_state, Some(instance), - Some(migration), + Migrations { migration_out: Some(&migration), migration_in: None }, + ); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_and_both_migrations() { + let vmm_id = PropolisUuid::nil(); + let vmm_state = mk_vmm_state(); + let migration_in = mk_migration_state(); + let migration_out = mk_migration_state(); + + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + None, + Migrations { + migration_in: Some(&migration_in), + migration_out: Some(&migration_out), + }, + ); + expectorate_query_contents( + &query, + "tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql", + ) + .await; + } + + #[tokio::test] + async fn expectorate_query_vmm_instance_and_both_migrations() { + let vmm_id = PropolisUuid::nil(); + let vmm_state = mk_vmm_state(); + let instance = mk_instance_state(); + let migration_in = mk_migration_state(); + let migration_out = mk_migration_state(); + + let query = InstanceAndVmmUpdate::new( + vmm_id, + vmm_state, + Some(instance), + Migrations { + migration_in: Some(&migration_in), + migration_out: Some(&migration_out), + }, ); expectorate_query_contents( &query, - "tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql", + "tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql", ) .await; } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index da09edd8a8f..758364de79e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1330,8 +1330,8 @@ impl super::Nexus { .db_datastore .vmm_and_migration_update_runtime( state.propolis_id, - &state.vmm_state.into(), - state.migration_state.as_ref(), + &state.vmm_state.clone().into(), + state.migrations(), ) .await; @@ -1523,13 +1523,14 @@ impl super::Nexus { instance_id: &InstanceUuid, new_runtime_state: &nexus::SledInstanceState, ) -> Result<(), Error> { - let migration = new_runtime_state.migration_state.as_ref(); + let migrations = new_runtime_state.migrations(); let propolis_id = new_runtime_state.propolis_id; info!(opctx.log, "received new VMM runtime state from sled agent"; "instance_id" => %instance_id, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?migration); + "migration_state" => ?migrations, + ); let (vmm_updated, migration_updated) = self .db_datastore @@ -1537,14 +1538,20 @@ impl super::Nexus { propolis_id, // TODO(eliza): probably should take this by value... 
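                // `new_runtime_state.migrations()` (passed just below) is assumed to
                // borrow both halves of the sled's migration report; a minimal sketch
                // of the shape this call relies on:
                //
                //     #[derive(Copy, Clone, Debug, Default)]
                //     pub struct Migrations<'a> {
                //         pub migration_in: Option<&'a MigrationRuntimeState>,
                //         pub migration_out: Option<&'a MigrationRuntimeState>,
                //     }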
&new_runtime_state.vmm_state.clone().into(), - migration, + migrations, ) .await?; let updated = vmm_updated || migration_updated.unwrap_or(false); if updated { + info!(opctx.log, "starting update saga for {instance_id}"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?migrations, + ); let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore) - .instance_id(*instance_id) + .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) .await?; let saga_params = sagas::instance_update::Params { @@ -2002,20 +2009,24 @@ pub(crate) async fn notify_instance_updated_background( instance_id: InstanceUuid, new_runtime_state: nexus::SledInstanceState, ) -> Result { + let migrations = new_runtime_state.migrations(); let propolis_id = new_runtime_state.propolis_id; info!(opctx.log, "received new VMM runtime state from sled agent"; "instance_id" => %instance_id, "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?new_runtime_state.migration_state); + "migration_state" => ?migrations, + ); - let updated = datastore - .vmm_update_runtime( - &propolis_id, + let (vmm_updated, migration_updated) = datastore + .vmm_and_migration_update_runtime( + propolis_id, // TODO(eliza): probably should take this by value... &new_runtime_state.vmm_state.clone().into(), + migrations, ) .await?; + let updated = vmm_updated || migration_updated.unwrap_or(false); if updated { let (.., authz_instance) = LookupPath::new(&opctx, datastore) @@ -2026,6 +2037,12 @@ pub(crate) async fn notify_instance_updated_background( serialized_authn: authn::saga::Serialized::for_opctx(opctx), authz_instance, }; + info!(opctx.log, "queueing update saga for {instance_id}"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?migrations, + ); saga_request .send(sagas::SagaRequest::InstanceUpdate { params }) .await @@ -2036,153 +2053,6 @@ pub(crate) async fn notify_instance_updated_background( })?; } - // // If the supplied instance state indicates that the instance no longer - // // has an active VMM, attempt to delete the virtual provisioning record, - // // and the assignment of the Propolis metric producer to an oximeter - // // collector. - // // - // // As with updating networking state, this must be done before - // // committing the new runtime state to the database: once the DB is - // // written, a new start saga can arrive and start the instance, which - // // will try to create its own virtual provisioning charges, which will - // // race with this operation. - // if new_runtime_state.instance_state.propolis_id.is_none() { - // datastore - // .virtual_provisioning_collection_delete_instance( - // opctx, - // *instance_id, - // db_instance.project_id, - // i64::from(db_instance.ncpus.0 .0), - // db_instance.memory, - // (&new_runtime_state.instance_state.gen).into(), - // ) - // .await?; - - // Write the new instance and VMM states back to CRDB. This needs to be - // done before trying to clean up the VMM, since the datastore will only - // allow a VMM to be marked as deleted if it is already in a terminal - // state. 
- // let result = datastore - // .instance_and_vmm_update_runtime( - // instance_id, - // &db::model::InstanceRuntimeState::from( - // new_runtime_state.instance_state.clone(), - // ), - // &propolis_id, - // &db::model::VmmRuntimeState::from( - // new_runtime_state.vmm_state.clone(), - // ), - // &new_runtime_state.migration_state, - // ) - // .await; - - // // Has a migration terminated? If so,mark the migration record as deleted if - // // and only if both sides of the migration are in a terminal state. - // if let Some(nexus::MigrationRuntimeState { - // migration_id, - // state, - // role, - // .. - // }) = new_runtime_state.migration_state - // { - // if state.is_terminal() { - // info!( - // log, - // "migration has terminated, trying to delete it..."; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id, - // "migration_id" => %propolis_id, - // "migration_state" => %state, - // "migration_role" => %role, - // ); - // if !datastore.migration_terminate(opctx, migration_id).await? { - // info!( - // log, - // "did not mark migration record as deleted (the other half \ - // may not yet have reported termination)"; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id, - // "migration_id" => %propolis_id, - // "migration_state" => %state, - // "migration_role" => %role, - // ); - // } - // } - // } - - // // If the VMM is now in a terminal state, make sure its resources get - // // cleaned up. - // // - // // For idempotency, only check to see if the update was successfully - // // processed and ignore whether the VMM record was actually updated. - // // This is required to handle the case where this routine is called - // // once, writes the terminal VMM state, fails before all per-VMM - // // resources are released, returns a retriable error, and is retried: - // // the per-VMM resources still need to be cleaned up, but the DB update - // // will return Ok(_, false) because the database was already updated. - // // - // // Unlike the pre-update cases, it is legal to do this cleanup *after* - // // committing state to the database, because a terminated VMM cannot be - // // reused (restarting or migrating its former instance will use new VMM - // // IDs). - // if result.is_ok() { - // let propolis_terminated = matches!( - // new_runtime_state.vmm_state.state, - // VmmState::Destroyed | VmmState::Failed - // ); - - // if propolis_terminated { - // info!(log, "vmm is terminated, cleaning up resources"; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id); - - // datastore - // .sled_reservation_delete(opctx, propolis_id.into_untyped_uuid()) - // .await?; - - // if !datastore.vmm_mark_deleted(opctx, &propolis_id).await? { - // warn!(log, "failed to mark vmm record as deleted"; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id, - // "vmm_state" => ?new_runtime_state.vmm_state); - // } - // } - // } - - // match result { - // Ok((instance_updated, vmm_updated)) => { - // info!(log, "instance and vmm updated by sled agent"; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id, - // "instance_updated" => instance_updated, - // "vmm_updated" => vmm_updated); - // Ok(Some(InstanceUpdated { instance_updated, vmm_updated })) - // } - - // // The update command should swallow object-not-found errors and - // // return them back as failures to update, so this error case is - // // unexpected. There's no work to do if this occurs, however. - // Err(Error::ObjectNotFound { .. 
}) => { - // error!(log, "instance/vmm update unexpectedly returned \ - // an object not found error"; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id); - // Ok(None) - // } - - // // If the datastore is unavailable, propagate that to the caller. - // // TODO-robustness Really this should be any _transient_ error. How - // // can we distinguish? Maybe datastore should emit something - // // different from Error with an Into. - // Err(error) => { - // warn!(log, "failed to update instance from sled agent"; - // "instance_id" => %instance_id, - // "propolis_id" => %propolis_id, - // "error" => ?error); - // Err(error) - // } - - // } Ok(updated) } diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 0c6b060f5e6..6400b71d0f8 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -3424,24 +3424,6 @@ "minLength": 5, "maxLength": 17 }, - "MigrationRole": { - "oneOf": [ - { - "description": "This update concerns the source VMM of a migration.", - "type": "string", - "enum": [ - "source" - ] - }, - { - "description": "This update concerns the target VMM of a migration.", - "type": "string", - "enum": [ - "target" - ] - } - ] - }, "MigrationRuntimeState": { "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", "type": "object", @@ -3453,9 +3435,6 @@ "type": "string", "format": "uuid" }, - "role": { - "$ref": "#/components/schemas/MigrationRole" - }, "state": { "$ref": "#/components/schemas/MigrationState" }, @@ -3468,7 +3447,6 @@ "required": [ "gen", "migration_id", - "role", "state", "time_updated" ] @@ -4669,9 +4647,18 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - "migration_state": { + "migration_in": { + "nullable": true, + "description": "The current state of any inbound migration to this VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, + "migration_out": { "nullable": true, - "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "description": "The state of any outbound migration to this VMM.", "allOf": [ { "$ref": "#/components/schemas/MigrationRuntimeState" diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index b041bd69703..478b6c52b0d 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -3500,24 +3500,6 @@ "minLength": 5, "maxLength": 17 }, - "MigrationRole": { - "oneOf": [ - { - "description": "This update concerns the source VMM of a migration.", - "type": "string", - "enum": [ - "source" - ] - }, - { - "description": "This update concerns the target VMM of a migration.", - "type": "string", - "enum": [ - "target" - ] - } - ] - }, "MigrationRuntimeState": { "description": "An update from a sled regarding the state of a migration, indicating the role of the VMM whose migration state was updated.", "type": "object", @@ -3529,9 +3511,6 @@ "type": "string", "format": "uuid" }, - "role": { - "$ref": "#/components/schemas/MigrationRole" - }, "state": { "$ref": "#/components/schemas/MigrationState" }, @@ -3544,7 +3523,6 @@ "required": [ "gen", "migration_id", - "role", "state", "time_updated" ] @@ -4615,9 +4593,18 @@ "description": "A wrapper type containing a sled's total knowledge of the state of a specific VMM and the instance it incarnates.", "type": "object", "properties": { - 
"migration_state": { + "migration_in": { + "nullable": true, + "description": "The current state of any inbound migration to this VMM.", + "allOf": [ + { + "$ref": "#/components/schemas/MigrationRuntimeState" + } + ] + }, + "migration_out": { "nullable": true, - "description": "The current state of any in-progress migration for this instance, as understood by this sled.", + "description": "The state of any outbound migration to this VMM.", "allOf": [ { "$ref": "#/components/schemas/MigrationRuntimeState" diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index a4d94dfac51..e35d9e3a28b 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -8,21 +8,23 @@ use crate::params::InstanceMigrationSourceParams; use chrono::{DateTime, Utc}; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ - MigrationRole, MigrationRuntimeState, MigrationState, SledInstanceState, - VmmRuntimeState, VmmState, + MigrationRuntimeState, MigrationState, SledInstanceState, VmmRuntimeState, + VmmState, }; use omicron_uuid_kinds::PropolisUuid; use propolis_client::types::{ - InstanceState as PropolisApiState, InstanceStateMonitorResponse, - MigrationState as PropolisMigrationState, + InstanceMigrationStatus, InstanceState as PropolisApiState, + InstanceStateMonitorResponse, MigrationState as PropolisMigrationState, }; +use uuid::Uuid; /// The instance and VMM state that sled agent maintains on a per-VMM basis. #[derive(Clone, Debug)] pub struct InstanceStates { vmm: VmmRuntimeState, propolis_id: PropolisUuid, - migration: Option, + migration_in: Option, + migration_out: Option, } /// Newtype to allow conversion from Propolis API states (returned by the @@ -100,9 +102,8 @@ pub(crate) struct ObservedPropolisState { /// The state reported by Propolis's instance state monitor API. pub vmm_state: PropolisInstanceState, - /// Information about whether the state observer queried migration status at - /// all and, if so, what response it got from Propolis. - pub migration_status: ObservedMigrationStatus, + pub migration_in: Option, + pub migration_out: Option, /// The approximate time at which this observation was made. pub time: DateTime, @@ -112,66 +113,41 @@ impl ObservedPropolisState { /// Constructs a Propolis state observation from an instance's current /// state and an instance state monitor response received from /// Propolis. - pub fn new( - state: &InstanceStates, - propolis_state: &InstanceStateMonitorResponse, - ) -> Self { - // If there's no migration currently registered with this sled, report - // the current state and that no migration is currently in progress, - // even if Propolis has some migration data to share. (This case arises - // when Propolis returns state from a previous migration that sled agent - // has already retired.) - // - // N.B. This needs to be read from the instance runtime state and not - // the migration runtime state to ensure that, once a migration in - // completes, the "completed" observation is reported to - // `InstanceStates::apply_propolis_observation` exactly once. - // Otherwise that routine will try to apply the "inbound migration - // complete" instance state transition twice. - let Some(migration_id) = instance_runtime.migration_id else { - return Self { - vmm_state: PropolisInstanceState(propolis_state.state), - migration_status: ObservedMigrationStatus::NoMigration, - time: Utc::now(), - }; - }; - - // Sled agent believes a live migration may be in progress. 
See if - // either of the Propolis migrations corresponds to it. - let propolis_migration = match ( - &propolis_state.migration.migration_in, - &propolis_state.migration.migration_out, - ) { - (Some(inbound), _) if inbound.id == migration_id => inbound, - (_, Some(outbound)) if outbound.id == migration_id => outbound, - _ => { - // Sled agent believes this instance should be migrating, but - // Propolis isn't reporting a matching migration yet, so assume - // the migration is still pending. - return Self { - vmm_state: PropolisInstanceState(propolis_state.state), - migration_status: ObservedMigrationStatus::Pending, - time: Utc::now(), - }; - } - }; - + pub fn new(propolis_state: &InstanceStateMonitorResponse) -> Self { Self { vmm_state: PropolisInstanceState(propolis_state.state), - migration_status: match propolis_migration.state { - PropolisMigrationState::Finish => { - ObservedMigrationStatus::Succeeded - } - PropolisMigrationState::Error => { - ObservedMigrationStatus::Failed - } - _ => ObservedMigrationStatus::InProgress, - }, + migration_in: propolis_state + .migration + .migration_in + .as_ref() + .map(ObservedMigrationState::from), + migration_out: propolis_state + .migration + .migration_out + .as_ref() + .map(ObservedMigrationState::from), time: Utc::now(), } } } +#[derive(Copy, Clone, Debug)] +pub struct ObservedMigrationState { + state: MigrationState, + id: Uuid, +} + +impl From<&'_ InstanceMigrationStatus> for ObservedMigrationState { + fn from(observed: &InstanceMigrationStatus) -> Self { + let state = match observed.state { + PropolisMigrationState::Error => MigrationState::Failed, + PropolisMigrationState::Finish => MigrationState::Completed, + _ => MigrationState::InProgress, + }; + Self { state, id: observed.id } + } +} + /// The set of instance states that sled agent can publish to Nexus. This is /// a subset of the instance states Nexus knows about: the Creating and /// Destroyed states are reserved for Nexus to use for instances that are being @@ -198,8 +174,13 @@ pub enum Action { } impl InstanceStates { - pub fn new(vmm: VmmRuntimeState, propolis_id: Uuid) -> Self { - InstanceStates { vmm, propolis_id, migration: None } + pub fn new(vmm: VmmRuntimeState, propolis_id: PropolisUuid) -> Self { + InstanceStates { + vmm, + propolis_id, + migration_in: None, + migration_out: None, + } } pub fn vmm(&self) -> &VmmRuntimeState { @@ -210,10 +191,6 @@ impl InstanceStates { self.propolis_id } - pub(crate) fn migration(&self) -> Option<&MigrationRuntimeState> { - self.migration.as_ref() - } - /// Creates a `SledInstanceState` structure containing the entirety of this /// structure's runtime state. This requires cloning; for simple read access /// use the `instance` or `vmm` accessors instead. @@ -221,25 +198,9 @@ impl InstanceStates { SledInstanceState { vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, - migration_state: self.migration.clone(), - } - } - - fn transition_migration( - &mut self, - state: MigrationState, - time_updated: DateTime, - ) { - let migration = self.migration.as_mut().expect( - "an ObservedMigrationState should only be constructed when the \ - VMM has an active migration", - ); - // Don't generate spurious state updates if the migration is already in - // the state we're transitioning to. 
- if migration.state != state { - migration.state = state; - migration.time_updated = time_updated; - migration.gen = migration.gen.next(); + // migration_state: self.migration.clone(), + migration_in: self.migration_in.clone(), + migration_out: self.migration_out.clone(), } } @@ -249,6 +210,49 @@ impl InstanceStates { &mut self, observed: &ObservedPropolisState, ) -> Option { + fn transition_migration( + current: &mut Option, + ObservedMigrationState { id, state }: ObservedMigrationState, + now: DateTime, + ) { + if let Some(ref mut m) = current { + // Don't generate spurious state updates if the migration is already in + // the state we're transitioning to. + if m.migration_id == id && m.state == state { + return; + } + m.state = state; + if m.migration_id == id { + m.gen = m.gen.next(); + } else { + m.migration_id = id; + m.gen = Generation::new(); + } + m.time_updated = now; + } else { + *current = Some(MigrationRuntimeState { + migration_id: id, + gen: Generation::new(), + state, + time_updated: now, + }); + } + } + + fn destroy_migration( + migration: &mut MigrationRuntimeState, + now: DateTime, + ) { + if matches!( + migration.state, + MigrationState::InProgress | MigrationState::Pending + ) { + migration.gen = migration.gen.next(); + migration.time_updated = now; + migration.state = MigrationState::Failed; + } + } + let vmm_gone = matches!( observed.vmm_state.0, PropolisApiState::Destroyed | PropolisApiState::Failed @@ -264,27 +268,11 @@ impl InstanceStates { // Update the instance record to reflect the result of any completed // migration. - match observed.migration_status { - ObservedMigrationStatus::Succeeded => { - self.transition_migration( - MigrationState::Completed, - observed.time, - ); - } - ObservedMigrationStatus::Failed => { - self.transition_migration( - MigrationState::Failed, - observed.time, - ); - } - ObservedMigrationStatus::InProgress => { - self.transition_migration( - MigrationState::InProgress, - observed.time, - ); - } - ObservedMigrationStatus::NoMigration - | ObservedMigrationStatus::Pending => {} + if let Some(m) = observed.migration_in { + transition_migration(&mut self.migration_in, m, observed.time); + } + if let Some(m) = observed.migration_out { + transition_migration(&mut self.migration_out, m, observed.time); } // If this Propolis has exited, tear down its zone. If it was in the @@ -301,13 +289,11 @@ impl InstanceStates { if vmm_gone { // If there's an active migration and the VMM is suddenly gone, // that should constitute a migration failure! - if let Some(MigrationState::Pending | MigrationState::InProgress) = - self.migration.as_ref().map(|m| m.state) - { - self.transition_migration( - MigrationState::Failed, - observed.time, - ); + if let Some(ref mut m) = self.migration_in { + destroy_migration(m, observed.time); + } + if let Some(ref mut m) = self.migration_out { + destroy_migration(m, observed.time); } Some(Action::Destroy) } else { @@ -353,11 +339,10 @@ impl InstanceStates { let fake_observed = ObservedPropolisState { vmm_state, - migration_status: if self.migration.is_some() { - ObservedMigrationStatus::Failed - } else { - ObservedMigrationStatus::NoMigration - }, + // We don't actually need to populate these, because observing a + // `Destroyed` instance state will fail any in progress migrations anyway. + migration_in: None, + migration_out: None, time: Utc::now(), }; @@ -366,51 +351,23 @@ impl InstanceStates { /// Sets or clears this instance's migration IDs and advances its Propolis /// generation number. 
+ #[deprecated(note = "eliza get rid of this")] pub(crate) fn set_migration_ids( &mut self, ids: &Option, now: DateTime, ) { - if let Some(InstanceMigrationSourceParams { - migration_id, - dst_propolis_id, - }) = *ids - { - let role = if dst_propolis_id == self.propolis_id { - MigrationRole::Target - } else { - MigrationRole::Source - }; - self.migration = Some(MigrationRuntimeState { - migration_id, - state: MigrationState::Pending, - role, - gen: Generation::new(), - time_updated: now, - }) - } else { - self.migration = None; - } } /// Returns true if the migration IDs in this instance are already set as they /// would be on a successful transition from the migration IDs in /// `old_runtime` to the ones in `migration_ids`. + #[deprecated(note = "eliza get rid of this")] pub(crate) fn migration_ids_already_set( &self, migration_ids: &Option, ) -> bool { - match (self.migration.as_ref(), migration_ids) { - // If the migration ID is already set, and this is a request to set - // IDs, the records match if the relevant IDs match. - (Some(migration), Some(ids)) => { - migration.migration_id == ids.migration_id - } - // If the migration ID is already cleared, and this is a request to - // clear IDs, the records match. - (None, None) => true, - _ => false, - } + false } } @@ -440,10 +397,9 @@ mod test { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; let migration_id = Uuid::new_v4(); - state.migration = Some(MigrationRuntimeState { + state.migration_out = Some(MigrationRuntimeState { migration_id, state: MigrationState::InProgress, - role: MigrationRole::Source, // advance the generation once, since we are starting out in the // `InProgress` state. gen: Generation::new().next(), @@ -457,11 +413,9 @@ mod test { let mut state = make_instance(); state.vmm.state = VmmState::Migrating; let migration_id = Uuid::new_v4(); - state.propolis_id = Uuid::new_v4(); - state.migration = Some(MigrationRuntimeState { + state.migration_in = Some(MigrationRuntimeState { migration_id, state: MigrationState::InProgress, - role: MigrationRole::Target, // advance the generation once, since we are starting out in the // `InProgress` state. 
gen: Generation::new().next(), @@ -475,7 +429,8 @@ mod test { ) -> ObservedPropolisState { ObservedPropolisState { vmm_state: propolis_state, - migration_status: ObservedMigrationStatus::NoMigration, + migration_in: None, + migration_out: None, time: Utc::now(), } } @@ -509,43 +464,58 @@ mod test { fn test_termination_fails_in_progress_migration( mk_instance: impl Fn() -> InstanceStates, ) { + } + + #[test] + fn source_termination_fails_in_progress_migration() { for state in [Observed::Destroyed, Observed::Failed] { - let mut instance_state = mk_instance(); - let original_migration = instance_state.clone().migration.unwrap(); + let mut instance_state = make_migration_source_instance(); + let original_migration = + instance_state.clone().migration_out.unwrap(); let requested_action = instance_state .apply_propolis_observation(&make_observed_state(state.into())); - let migration = - instance_state.migration.expect("state must have a migration"); + let migration = instance_state + .migration_out + .expect("state must have a migration"); assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > original_migration.gen); assert!(matches!(requested_action, Some(Action::Destroy))); } } - #[test] - fn source_termination_fails_in_progress_migration() { - test_termination_fails_in_progress_migration( - make_migration_source_instance, - ) - } - #[test] fn target_termination_fails_in_progress_migration() { - test_termination_fails_in_progress_migration( - make_migration_target_instance, - ) + for state in [Observed::Destroyed, Observed::Failed] { + let mut instance_state = make_migration_target_instance(); + let original_migration = + instance_state.clone().migration_in.unwrap(); + let requested_action = instance_state + .apply_propolis_observation(&make_observed_state(state.into())); + + let migration = instance_state + .migration_in + .expect("state must have a migration"); + assert_eq!(migration.state, MigrationState::Failed); + assert!(migration.gen > original_migration.gen); + assert!(matches!(requested_action, Some(Action::Destroy))); + } } #[test] fn destruction_after_migration_out_does_not_transition() { let mut state = make_migration_source_instance(); + let migration_id = state.migration_out.unwrap().migration_id; // After a migration succeeds, the source VM appears to stop but reports // that the migration has succeeded. let mut observed = ObservedPropolisState { vmm_state: PropolisInstanceState(Observed::Stopping), - migration_status: ObservedMigrationStatus::Succeeded, + migration_out: Some(ObservedMigrationState { + state: MigrationState::Completed, + id: migration_id, + }), + migration_in: None, time: Utc::now(), }; @@ -558,11 +528,11 @@ mod test { // The migration state should transition to "completed" let migration = state - .migration + .migration_out .clone() .expect("instance must have a migration state"); let prev_migration = - prev.migration.expect("previous state must have a migration"); + prev.migration_out.expect("previous state must have a migration"); assert_eq!(migration.state, MigrationState::Completed); assert!(migration.gen > prev_migration.gen); let prev_migration = migration; @@ -583,7 +553,7 @@ mod test { // Now that the migration has completed, it should not transition again. 
let migration = state - .migration + .migration_out .clone() .expect("instance must have a migration state"); assert_eq!(migration.state, MigrationState::Completed); @@ -601,7 +571,7 @@ mod test { assert!(state.vmm.gen > prev.vmm.gen); let migration = state - .migration + .migration_out .clone() .expect("instance must have a migration state"); assert_eq!(migration.state, MigrationState::Completed); @@ -611,12 +581,17 @@ mod test { #[test] fn failure_after_migration_in_does_not_transition() { let mut state = make_migration_target_instance(); + let migration_id = state.migration_in.unwrap().migration_id; // Failure to migrate into an instance should mark the VMM as destroyed // but should not change the instance's migration IDs. let observed = ObservedPropolisState { vmm_state: PropolisInstanceState(Observed::Failed), - migration_status: ObservedMigrationStatus::Failed, + migration_in: Some(ObservedMigrationState { + state: MigrationState::Failed, + id: migration_id, + }), + migration_out: None, time: Utc::now(), }; @@ -631,9 +606,9 @@ mod test { // The migration state should transition. let migration = - state.migration.expect("instance must have a migration state"); + state.migration_in.expect("instance must have a migration state"); let prev_migration = - prev.migration.expect("previous state must have a migration"); + prev.migration_in.expect("previous state must have a migration"); assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > prev_migration.gen); } @@ -656,99 +631,101 @@ mod test { // The migration state should transition. let migration = - state.migration.expect("instance must have a migration state"); + state.migration_in.expect("instance must have a migration state"); let prev_migration = - prev.migration.expect("previous state must have a migration"); + prev.migration_in.expect("previous state must have a migration"); assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > prev_migration.gen); } #[test] + #[ignore = "this logic is basically trivial now, maybe just get rid of the test?"] fn migration_out_after_migration_in() { - let mut state = make_migration_target_instance(); - let mut observed = ObservedPropolisState { - vmm_state: PropolisInstanceState(Observed::Running), - migration_status: ObservedMigrationStatus::Succeeded, - time: Utc::now(), - }; - - // The transition into the Running state on the migration target should - // take over for the source, updating the Propolis generation. - let prev = state.clone(); - assert!(state.apply_propolis_observation(&observed).is_none()); - assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.vmm.state, VmmState::Running); - assert!(state.vmm.gen > prev.vmm.gen); - - // The migration state should transition to completed. - let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - let prev_migration = - prev.migration.expect("previous state must have a migration"); - assert_eq!(migration.state, MigrationState::Completed); - assert!(migration.gen > prev_migration.gen); - - // Pretend Nexus set some new migration IDs. - let migration_id = Uuid::new_v4(); - let prev = state.clone(); - state.set_migration_ids( - &Some(InstanceMigrationSourceParams { - migration_id, - dst_propolis_id: PropolisUuid::new_v4(), - }), - Utc::now(), - ); - assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.vmm.gen, prev.vmm.gen); - - // There should be a new, pending migration state. 
- let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - assert_eq!(migration.state, MigrationState::Pending); - assert_eq!(migration.migration_id, migration_id); - let prev_migration = migration; - - // Mark that the new migration out is in progress. This doesn't change - // anything in the instance runtime state, but does update the VMM state - // generation. - let prev = state.clone(); - observed.vmm_state = PropolisInstanceState(Observed::Migrating); - assert!(state.apply_propolis_observation(&observed).is_none()); - assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.vmm.state, VmmState::Migrating); - assert!(state.vmm.gen > prev.vmm.gen); - - // The migration state should transition to in progress. - let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - assert_eq!(migration.state, MigrationState::InProgress); - assert!(migration.gen > prev_migration.gen); - let prev_migration = migration; - - // Propolis will publish that the migration succeeds before changing any - // state. This should transfer control to the target but should not - // touch the migration ID (that is the new target's job). - let prev = state.clone(); - observed.vmm_state = PropolisInstanceState(Observed::Migrating); - assert!(state.apply_propolis_observation(&observed).is_none()); - assert_state_change_has_gen_change(&prev, &state); - assert_eq!(state.vmm.state, VmmState::Migrating); - assert!(state.vmm.gen > prev.vmm.gen); - - // The migration state should transition to completed. - let migration = state - .migration - .clone() - .expect("instance must have a migration state"); - assert_eq!(migration.state, MigrationState::Completed); - assert!(migration.gen > prev_migration.gen); - - // The rest of the destruction sequence is covered by other tests. + todo!("eliza") + // let mut state = make_migration_target_instance(); + // let mut observed = ObservedPropolisState { + // vmm_state: PropolisInstanceState(Observed::Running), + // migration_in: ObservedMigrationStatus::Succeeded, + // time: Utc::now(), + // }; + + // // The transition into the Running state on the migration target should + // // take over for the source, updating the Propolis generation. + // let prev = state.clone(); + // assert!(state.apply_propolis_observation(&observed).is_none()); + // assert_state_change_has_gen_change(&prev, &state); + // assert_eq!(state.vmm.state, VmmState::Running); + // assert!(state.vmm.gen > prev.vmm.gen); + + // // The migration state should transition to completed. + // let migration = state + // .migration + // .clone() + // .expect("instance must have a migration state"); + // let prev_migration = + // prev.migration.expect("previous state must have a migration"); + // assert_eq!(migration.state, MigrationState::Completed); + // assert!(migration.gen > prev_migration.gen); + + // // Pretend Nexus set some new migration IDs. + // let migration_id = Uuid::new_v4(); + // let prev = state.clone(); + // state.set_migration_ids( + // &Some(InstanceMigrationSourceParams { + // migration_id, + // dst_propolis_id: PropolisUuid::new_v4(), + // }), + // Utc::now(), + // ); + // assert_state_change_has_gen_change(&prev, &state); + // assert_eq!(state.vmm.gen, prev.vmm.gen); + + // // There should be a new, pending migration state. 
+ // let migration = state + // .migration + // .clone() + // .expect("instance must have a migration state"); + // assert_eq!(migration.state, MigrationState::Pending); + // assert_eq!(migration.migration_id, migration_id); + // let prev_migration = migration; + + // // Mark that the new migration out is in progress. This doesn't change + // // anything in the instance runtime state, but does update the VMM state + // // generation. + // let prev = state.clone(); + // observed.vmm_state = PropolisInstanceState(Observed::Migrating); + // assert!(state.apply_propolis_observation(&observed).is_none()); + // assert_state_change_has_gen_change(&prev, &state); + // assert_eq!(state.vmm.state, VmmState::Migrating); + // assert!(state.vmm.gen > prev.vmm.gen); + + // // The migration state should transition to in progress. + // let migration = state + // .migration + // .clone() + // .expect("instance must have a migration state"); + // assert_eq!(migration.state, MigrationState::InProgress); + // assert!(migration.gen > prev_migration.gen); + // let prev_migration = migration; + + // // Propolis will publish that the migration succeeds before changing any + // // state. This should transfer control to the target but should not + // // touch the migration ID (that is the new target's job). + // let prev = state.clone(); + // observed.vmm_state = PropolisInstanceState(Observed::Migrating); + // assert!(state.apply_propolis_observation(&observed).is_none()); + // assert_state_change_has_gen_change(&prev, &state); + // assert_eq!(state.vmm.state, VmmState::Migrating); + // assert!(state.vmm.gen > prev.vmm.gen); + + // // The migration state should transition to completed. + // let migration = state + // .migration + // .clone() + // .expect("instance must have a migration state"); + // assert_eq!(migration.state, MigrationState::Completed); + // assert!(migration.gen > prev_migration.gen); + + // // The rest of the destruction sequence is covered by other tests. } } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 688ed195b81..6034b50ee8b 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -384,10 +384,7 @@ impl InstanceRunner { use InstanceMonitorRequest::*; match request { Some(Update { state, tx }) => { - let observed = ObservedPropolisState::new( - &self.state, - &state, - ); + let observed = ObservedPropolisState::new(&state); let reaction = self.observe_state(&observed).await; self.publish_state_to_nexus().await; @@ -707,13 +704,7 @@ impl InstanceRunner { let migrate = match migrate { Some(params) => { - let migration_id = self - .state - .migration() - .ok_or_else(|| { - Error::Migration(anyhow!("Missing Migration UUID")) - })? 
- .migration_id; + let migration_id = todo!("eliza: this probably needs to be sent by Nexus directly now?"); Some(propolis_client::types::InstanceMigrateInitiateRequest { src_addr: params.src_propolis_addr.to_string(), src_uuid: params.src_propolis_id, diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index c9197fc3b86..ffb7327ce77 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -431,15 +431,19 @@ mod test { fn make_instance( logctx: &LogContext, ) -> (SimObject, Receiver<()>) { - let propolis_id = Uuid::new_v4(); + let propolis_id = PropolisUuid::new_v4(); let vmm_state = VmmRuntimeState { state: VmmState::Starting, gen: Generation::new(), time_updated: Utc::now(), }; - let state = - SledInstanceState { vmm_state, propolis_id, migration_state: None }; + let state = SledInstanceState { + vmm_state, + propolis_id, + migration_in: None, + migration_out: None, + }; SimObject::new_simulated_auto(&state, logctx.log.new(o!())) } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 33b9cc10a1e..abd7b78223e 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -16,7 +16,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, MigrationRole, SledInstanceState, VmmState, + InstanceRuntimeState, SledInstanceState, VmmState, }; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateResponse, @@ -81,54 +81,47 @@ impl SimInstanceInner { /// Queue a successful simulated migration. /// - fn queue_successful_migration(&mut self, role: MigrationRole) { + fn queue_successful_migration(&mut self) { // Propolis transitions to the Migrating state once before // actually starting migration. self.queue_propolis_state(PropolisInstanceState::Migrating); - let migration_id = self.state.migration().unwrap_or_else(|| { - panic!( - "should have migration ID set before getting request to - migrate in (current state: {:?})", - self - ) - }); - - match role { - MigrationRole::Source => { - self.queue_migration_update(PropolisMigrateResponse { - migration_in: None, - migration_out: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Sync, - }), - }); - self.queue_migration_update(PropolisMigrateResponse { - migration_in: None, - migration_out: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Finish, - }), - }); - self.queue_graceful_stop(); - } - MigrationRole::Target => { - self.queue_migration_update(PropolisMigrateResponse { - migration_in: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Sync, - }), - migration_out: None, - }); - self.queue_migration_update(PropolisMigrateResponse { - migration_in: Some(PropolisMigrationStatus { - id: migration_id, - state: propolis_client::types::MigrationState::Finish, - }), - migration_out: None, - }); - self.queue_propolis_state(PropolisInstanceState::Running) - } - } + todo!("eliza: fix this bit") + // match role { + // MigrationRole::Source => { + // self.queue_migration_update(PropolisMigrateResponse { + // migration_in: None, + // migration_out: Some(PropolisMigrationStatus { + // id: todo! 
+ // state: propolis_client::types::MigrationState::Sync, + // }), + // }); + // self.queue_migration_update(PropolisMigrateResponse { + // migration_in: None, + // migration_out: Some(PropolisMigrationStatus { + // id: migration_id, + // state: propolis_client::types::MigrationState::Finish, + // }), + // }); + // self.queue_graceful_stop(); + // } + // MigrationRole::Target => { + // self.queue_migration_update(PropolisMigrateResponse { + // migration_in: Some(PropolisMigrationStatus { + // id: migration_id, + // state: propolis_client::types::MigrationState::Sync, + // }), + // migration_out: None, + // }); + // self.queue_migration_update(PropolisMigrateResponse { + // migration_in: Some(PropolisMigrationStatus { + // id: migration_id, + // state: propolis_client::types::MigrationState::Finish, + // }), + // migration_out: None, + // }); + // self.queue_propolis_state(PropolisInstanceState::Running) + // } + // } } fn queue_graceful_stop(&mut self) { @@ -178,7 +171,7 @@ impl SimInstanceInner { ))); } - self.queue_successful_migration(MigrationRole::Target) + // self.queue_successful_migration(MigrationRole::Target) } InstanceStateRequested::Running => { match self.next_resting_state() { @@ -278,7 +271,6 @@ impl SimInstanceInner { } self.state.apply_propolis_observation(&ObservedPropolisState::new( - &self.state, &self.last_response, )) } else { @@ -384,19 +376,20 @@ impl SimInstanceInner { // If we set migration IDs and are the migration source, ensure that we // will perform the correct state transitions to simulate a successful // migration. - if ids.is_some() { - let role = self - .state - .migration() - .expect( - "we just got a `put_migration_ids` request with `Some` IDs, \ - so we should have a migration" - ) - .role; - if role == MigrationRole::Source { - self.queue_successful_migration(MigrationRole::Source) - } - } + // if ids.is_some() { + // let role = self + // .state + // .migration() + // .expect( + // "we just got a `put_migration_ids` request with `Some` IDs, \ + // so we should have a migration" + // ) + // .role; + // if role == MigrationRole::Source { + // self.queue_successful_migration(MigrationRole::Source) + // } + // } + todo!(); Ok(self.state.sled_instance_state()) } diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index ff93d598e03..746dd2f22ee 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -375,7 +375,8 @@ impl SledAgent { SledInstanceState { vmm_state: vmm_runtime, propolis_id, - migration_state: None, + migration_in: None, + migration_out: None, }, None, ) From 53299b37a132eb9ba79dc33022b98cca13a99716 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 19 Jun 2024 16:43:40 -0700 Subject: [PATCH 050/234] compiley-ness --- sled-agent/src/common/instance.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index e35d9e3a28b..358a2fde4ad 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -505,7 +505,7 @@ mod test { #[test] fn destruction_after_migration_out_does_not_transition() { let mut state = make_migration_source_instance(); - let migration_id = state.migration_out.unwrap().migration_id; + let migration_id = state.migration_out.as_ref().unwrap().migration_id; // After a migration succeeds, the source VM appears to stop but reports // that the migration has succeeded. 
@@ -581,7 +581,7 @@ mod test { #[test] fn failure_after_migration_in_does_not_transition() { let mut state = make_migration_target_instance(); - let migration_id = state.migration_in.unwrap().migration_id; + let migration_id = state.migration_in.as_ref().unwrap().migration_id; // Failure to migrate into an instance should mark the VMM as destroyed // but should not change the instance's migration IDs. From 777cdcca6d2b2559915ea522f986719a6798d6b7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 20 Jun 2024 11:50:03 -0700 Subject: [PATCH 051/234] remove most of `instance_set_migration_ids` --- clients/sled-agent-client/src/lib.rs | 40 +++++ nexus/src/app/instance.rs | 73 +-------- nexus/src/app/sagas/instance_migrate.rs | 22 +-- nexus/tests/integration_tests/instances.rs | 75 ++++++++- sled-agent/src/common/instance.rs | 30 +--- sled-agent/src/http_entrypoints.rs | 19 +-- sled-agent/src/instance.rs | 50 +----- sled-agent/src/instance_manager.rs | 53 +------ sled-agent/src/sim/http_entrypoints.rs | 35 +++-- sled-agent/src/sim/instance.rs | 173 +++++++++++---------- sled-agent/src/sim/sled_agent.rs | 53 ++++--- sled-agent/src/sled_agent.rs | 23 +-- 12 files changed, 292 insertions(+), 354 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index ba3a1256ce3..98671ebf48d 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -5,6 +5,9 @@ //! Interface for making API requests to a Sled Agent use async_trait::async_trait; +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; use std::convert::TryFrom; use uuid::Uuid; @@ -445,6 +448,11 @@ impl From #[async_trait] pub trait TestInterfaces { async fn instance_finish_transition(&self, id: Uuid); + async fn instance_simulate_migration_source( + &self, + id: Uuid, + params: SimulateMigrationSource, + ); async fn disk_finish_transition(&self, id: Uuid); } @@ -471,4 +479,36 @@ impl TestInterfaces for Client { .await .expect("disk_finish_transition() failed unexpectedly"); } + + async fn instance_simulate_migration_source( + &self, + id: Uuid, + params: SimulateMigrationSource, + ) { + let baseurl = self.baseurl(); + let client = self.client(); + let url = format!("{baseurl}/instances/{id}/sim-migration-source"); + client + .post(url) + .send() + .await + .expect("instance_simulate_migration_source() failed unexpectedly"); + } +} + +// N.B. that this needs to be kept in sync with the types defined in +// `sled_agent::sim`! AFAICT this is the first simulated-only interface that has +// a body, so I wasn't sure whether there was a nice way to do this without +// creating a cyclic dependency or taking a giant pile of query params instead +// of JSON... 
+#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SimulateMigrationSource { + pub migration_id: Uuid, + pub result: SimulatedMigrationResult, +} + +#[derive(Serialize, Deserialize, JsonSchema)] +pub enum SimulatedMigrationResult { + Success, + Failure, } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 758364de79e..67db4377d91 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -548,7 +548,7 @@ impl super::Nexus { &self, opctx: &OpContext, instance_id: InstanceUuid, - sled_id: SledUuid, + src_propolis_id: PropolisUuid, prev_instance_runtime: &db::model::InstanceRuntimeState, migration_params: InstanceMigrationSourceParams, ) -> UpdateResult { @@ -560,42 +560,7 @@ impl super::Nexus { .lookup_for(authz::Action::Modify) .await?; - let sa = self.sled_client(&sled_id).await?; - let instance_put_result = sa - .instance_put_migration_ids( - &instance_id, - &InstancePutMigrationIdsBody { - old_runtime: prev_instance_runtime.clone().into(), - migration_params: Some(migration_params), - }, - ) - .await - .map(|res| Some(res.into_inner().into())) - .map_err(|e| SledAgentInstancePutError(e)); - - // Write the updated instance runtime state back to CRDB. If this - // outright fails, this operation fails. If the operation nominally - // succeeds but nothing was updated, this action is outdated and the - // caller should not proceed with migration. - let InstanceUpdateResult { instance_updated, .. } = - match instance_put_result { - Ok(state) => { - self.write_returned_instance_state(&instance_id, state) - .await? - } - Err(e) => { - if e.instance_unhealthy() { - let _ = self - .mark_instance_failed( - &instance_id, - &prev_instance_runtime, - &e, - ) - .await; - } - return Err(e.into()); - } - }; + let instance_updated = todo!("eliza: do this transition purely in nexus rather than in sled-agent..."); if instance_updated { Ok(self @@ -627,44 +592,12 @@ impl super::Nexus { pub(crate) async fn instance_clear_migration_ids( &self, instance_id: InstanceUuid, - sled_id: SledUuid, prev_instance_runtime: &db::model::InstanceRuntimeState, ) -> Result<(), Error> { assert!(prev_instance_runtime.migration_id.is_some()); assert!(prev_instance_runtime.dst_propolis_id.is_some()); - let sa = self.sled_client(&sled_id).await?; - let instance_put_result = sa - .instance_put_migration_ids( - &instance_id, - &InstancePutMigrationIdsBody { - old_runtime: prev_instance_runtime.clone().into(), - migration_params: None, - }, - ) - .await - .map(|res| Some(res.into_inner().into())) - .map_err(|e| SledAgentInstancePutError(e)); - - match instance_put_result { - Ok(state) => { - self.write_returned_instance_state(&instance_id, state).await?; - } - Err(e) => { - if e.instance_unhealthy() { - let _ = self - .mark_instance_failed( - &instance_id, - &prev_instance_runtime, - &e, - ) - .await; - } - return Err(e.into()); - } - } - - Ok(()) + todo!("eliza: do this transition in the DB rather than in sled-agent") } /// Reboot the specified instance. diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index b8599feb049..23b8c94a7d2 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -77,14 +77,7 @@ declare_saga_actions! { // This step the instance's migration ID and destination Propolis ID - // fields. 
Because the instance is active, its current sled agent maintains - // its most recent runtime state, so to update it, the saga calls into the - // sled and asks it to produce an updated instance record with the - // appropriate migration IDs and a new generation number. - // - // The source sled agent synchronizes concurrent attempts to set these IDs. - // Setting a new migration ID and re-setting an existing ID are allowed, but - // trying to set an ID when a different ID is already present fails. + // fields. SET_MIGRATION_IDS -> "set_migration_ids" { + sim_set_migration_ids - sim_clear_migration_ids @@ -323,14 +316,15 @@ async fn sim_set_migration_ids( let db_instance = ¶ms.instance; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); - let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id); + let src_propolis_id = + PropolisUuid::from_untyped_uuid(params.src_vmm.sled_id); let migration_id = sagactx.lookup::("migrate_id")?; let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; info!(osagactx.log(), "setting migration IDs on migration source sled"; "instance_id" => %db_instance.id(), - "sled_id" => %src_sled_id, "migration_id" => %migration_id, + "src_propolis_id" => %src_propolis_id, "dst_propolis_id" => %dst_propolis_id, "prev_runtime_state" => ?db_instance.runtime()); @@ -339,7 +333,7 @@ async fn sim_set_migration_ids( .instance_set_migration_ids( &opctx, instance_id, - src_sled_id, + src_propolis_id, db_instance.runtime(), InstanceMigrationSourceParams { dst_propolis_id, migration_id }, ) @@ -378,11 +372,7 @@ async fn sim_clear_migration_ids( // as failed. if let Err(e) = osagactx .nexus() - .instance_clear_migration_ids( - instance_id, - src_sled_id, - db_instance.runtime(), - ) + .instance_clear_migration_ids(instance_id, db_instance.runtime()) .await { warn!(osagactx.log(), diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 9c965ccf8ae..d670ebc43e3 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -842,10 +842,18 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { // sufficient to move the instance back into a Running state (strictly // speaking no further updates from the source are required if the target // successfully takes over). - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + instance_simulate_migration_source( + cptestctx, + nexus, + original_sled, + instance_id, + migration_id, + ) + .await; // Ensure that both sled agents report that the migration has completed. 
instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) .await; + instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; let instance = instance_get(&client, &instance_url).await; assert_eq!(instance.runtime.run_state, InstanceState::Running); @@ -973,8 +981,40 @@ async fn test_instance_migrate_v2p_and_routes( .parsed_body::() .unwrap(); + let migration_id = { + let datastore = apictx.nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance.identity.id) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await + .unwrap(); + datastore + .instance_refetch(&opctx, &authz_instance) + .await + .unwrap() + .runtime_state + .migration_id + .expect("since we've started a migration, the instance record must have a migration id!") + }; + + // Tell both sled-agents to pretend to do the migration. + instance_simulate_migration_source( + cptestctx, + nexus, + original_sled, + instance_id, + migration_id, + ) + .await; + instance_simulate_on_sled(cptestctx, nexus, original_sled_id, instance_id) + .await; instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; let instance = instance_get(&client, &instance_url).await; + assert_eq!(instance.runtime.run_state, InstanceState::Running); let current_sled = nexus .instance_sled_id(&instance_id) @@ -4923,3 +4963,36 @@ async fn instance_simulate_on_sled( let sa = nexus.sled_client(&sled_id).await.unwrap(); sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; } + +/// Simulates a migration source for the provided instance ID, sled ID, and +/// migration ID. +// +// XXX(eliza): I had really wanted to have the migration target's simulated +// sled-agent do this automagically when it's told to start a migration in, but +// unfortunately, I wasn't able to figure out a way for it to get the simulated +// *sled-agent*'s IP --- it just gets the Propolis IP in the migration target +// params, and the propolis doesn't actually exist... +async fn instance_simulate_migration_source( + cptestctx: &ControlPlaneTestContext, + nexus: &Arc, + sled_id: SledUuid, + instance_id: InstanceUuid, + migration_id: Uuid, +) { + info!( + &cptestctx.logctx.log, + "Simulating migration source sled"; + "instance_id" => %instance_id, + "sled_id" => %sled_id, + "migration_id" => %migration_id, + ); + let sa = nexus.sled_client(&sled_id).await.unwrap(); + sa.instance_simulate_migrationSource( + instance_id.into_untyped_uuid(), + sled_agent_client::SimulateMigrationSource { + migration_id, + result: sled_agent_client::SimulatedMigrationResult::Success, + }, + ) + .await; +} diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 358a2fde4ad..87939cb7cf2 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -4,7 +4,6 @@ //! Describes the states of VM instances. -use crate::params::InstanceMigrationSourceParams; use chrono::{DateTime, Utc}; use omicron_common::api::external::Generation; use omicron_common::api::internal::nexus::{ @@ -191,6 +190,14 @@ impl InstanceStates { self.propolis_id } + pub fn migration_in(&self) -> Option<&MigrationRuntimeState> { + self.migration_in.as_ref() + } + + pub fn migration_out(&self) -> Option<&MigrationRuntimeState> { + self.migration_out.as_ref() + } + /// Creates a `SledInstanceState` structure containing the entirety of this /// structure's runtime state. 
This requires cloning; for simple read access /// use the `instance` or `vmm` accessors instead. @@ -348,27 +355,6 @@ impl InstanceStates { self.apply_propolis_observation(&fake_observed); } - - /// Sets or clears this instance's migration IDs and advances its Propolis - /// generation number. - #[deprecated(note = "eliza get rid of this")] - pub(crate) fn set_migration_ids( - &mut self, - ids: &Option, - now: DateTime, - ) { - } - - /// Returns true if the migration IDs in this instance are already set as they - /// would be on a successful transition from the migration IDs in - /// `old_runtime` to the ones in `migration_ids`. - #[deprecated(note = "eliza get rid of this")] - pub(crate) fn migration_ids_already_set( - &self, - migration_ids: &Option, - ) -> bool { - false - } } #[cfg(test)] diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 407254419cf..7ce7be1d079 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -501,20 +501,13 @@ async fn instance_get_state( path = "/instances/{instance_id}/migration-ids", }] async fn instance_put_migration_ids( - rqctx: RequestContext, - path_params: Path, - body: TypedBody, + _: RequestContext, + _: Path, + _: TypedBody, ) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_put_migration_ids( - instance_id, - &body_args.old_runtime, - &body_args.migration_params, - ) - .await?, + Err(HttpError::for_bad_request( + None, + "operation no longer supported".to_string(), )) } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 6034b50ee8b..e271d268120 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -16,9 +16,9 @@ use crate::nexus::NexusClientWithResolver; use crate::params::ZoneBundleMetadata; use crate::params::{InstanceExternalIpBody, ZoneBundleCause}; use crate::params::{ - InstanceHardware, InstanceMetadata, InstanceMigrationSourceParams, - InstanceMigrationTargetParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, VpcFirewallRule, + InstanceHardware, InstanceMetadata, InstanceMigrationTargetParams, + InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, VpcFirewallRule, }; use crate::profile::*; use crate::zone_bundle::BundleError; @@ -33,7 +33,7 @@ use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::svc::wait_for_service; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmRuntimeState, + SledInstanceState, VmmRuntimeState, }; use omicron_common::api::internal::shared::{ NetworkInterface, SledIdentifiers, SourceNatConfig, @@ -228,11 +228,6 @@ enum InstanceRequest { state: crate::params::InstanceStateRequested, tx: oneshot::Sender>, }, - PutMigrationIds { - old_runtime: InstanceRuntimeState, - migration_ids: Option, - tx: oneshot::Sender>, - }, Terminate { mark_failed: bool, tx: oneshot::Sender>, @@ -428,12 +423,6 @@ impl InstanceRunner { .map_err(|e| e.into())) .map_err(|_| Error::FailedSendClientClosed) }, - Some(PutMigrationIds{ old_runtime, migration_ids, tx }) => { - tx.send( - self.put_migration_ids(&migration_ids).await.map_err(|e| e.into()) - ) - .map_err(|_| Error::FailedSendClientClosed) - }, Some(Terminate { mark_failed, tx }) => { tx.send(Ok(InstanceUnregisterResponse { updated_runtime: 
Some(self.terminate(mark_failed).await) @@ -1155,23 +1144,6 @@ impl Instance { Ok(()) } - pub async fn put_migration_ids( - &self, - tx: oneshot::Sender>, - old_runtime: InstanceRuntimeState, - migration_ids: Option, - ) -> Result<(), Error> { - self.tx - .send(InstanceRequest::PutMigrationIds { - old_runtime, - migration_ids, - tx, - }) - .await - .map_err(|_| Error::FailedSendChannelClosed)?; - Ok(()) - } - /// Rudely terminates this instance's Propolis (if it has one) and /// immediately transitions the instance to the Destroyed state. pub async fn terminate( @@ -1358,20 +1330,6 @@ impl InstanceRunner { Ok(self.state.sled_instance_state()) } - async fn put_migration_ids( - &mut self, - migration_ids: &Option, - ) -> Result { - // Allow this transition for idempotency if the instance is - // already in the requested goal state. - if self.state.migration_ids_already_set(migration_ids) { - return Ok(self.state.sled_instance_state()); - } - - self.state.set_migration_ids(migration_ids, Utc::now()); - Ok(self.state.sled_instance_state()) - } - async fn setup_propolis_inner(&mut self) -> Result { // Create OPTE ports for the instance. We also store the names of all // those ports to notify the metrics task to start collecting statistics diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index afa1e7797e1..012af14b6ba 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -12,8 +12,8 @@ use crate::params::InstanceExternalIpBody; use crate::params::InstanceMetadata; use crate::params::ZoneBundleMetadata; use crate::params::{ - InstanceHardware, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, + InstanceHardware, InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, }; use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::BundleError; @@ -225,26 +225,6 @@ impl InstanceManager { } } - pub async fn put_migration_ids( - &self, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - let (tx, rx) = oneshot::channel(); - self.inner - .tx - .send(InstanceManagerRequest::PutMigrationIds { - instance_id, - old_runtime: old_runtime.clone(), - migration_ids: *migration_ids, - tx, - }) - .await - .map_err(|_| Error::FailedSendInstanceManagerClosed)?; - rx.await? - } - pub async fn instance_issue_disk_snapshot_request( &self, instance_id: InstanceUuid, @@ -382,12 +362,7 @@ enum InstanceManagerRequest { target: InstanceStateRequested, tx: oneshot::Sender>, }, - PutMigrationIds { - instance_id: InstanceUuid, - old_runtime: InstanceRuntimeState, - migration_ids: Option, - tx: oneshot::Sender>, - }, + InstanceIssueDiskSnapshot { instance_id: InstanceUuid, disk_id: Uuid, @@ -515,9 +490,6 @@ impl InstanceManagerRunner { Some(EnsureState { instance_id, target, tx }) => { self.ensure_state(tx, instance_id, target).await }, - Some(PutMigrationIds { instance_id, old_runtime, migration_ids, tx }) => { - self.put_migration_ids(tx, instance_id, &old_runtime, &migration_ids).await - }, Some(InstanceIssueDiskSnapshot { instance_id, disk_id, snapshot_id, tx }) => { self.instance_issue_disk_snapshot_request(tx, instance_id, disk_id, snapshot_id).await }, @@ -728,25 +700,6 @@ impl InstanceManagerRunner { Ok(()) } - /// Idempotently attempts to set the instance's migration IDs to the - /// supplied IDs. 
- async fn put_migration_ids( - &mut self, - tx: oneshot::Sender>, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result<(), Error> { - let (_, instance) = self - .instances - .get(&instance_id) - .ok_or_else(|| Error::NoSuchInstance(instance_id))?; - instance - .put_migration_ids(tx, old_runtime.clone(), *migration_ids) - .await?; - Ok(()) - } - async fn instance_issue_disk_snapshot_request( &self, tx: oneshot::Sender>, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 268e8a9cf1e..40216457b35 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -162,20 +162,13 @@ async fn instance_get_state( path = "/instances/{instance_id}/migration-ids", }] async fn instance_put_migration_ids( - rqctx: RequestContext>, - path_params: Path, - body: TypedBody, + _: RequestContext>, + _: Path, + _: TypedBody, ) -> Result, HttpError> { - let sa = rqctx.context(); - let instance_id = path_params.into_inner().instance_id; - let body_args = body.into_inner(); - Ok(HttpResponseOk( - sa.instance_put_migration_ids( - instance_id, - &body_args.old_runtime, - &body_args.migration_params, - ) - .await?, + Err(HttpError::for_bad_request( + None, + "operation no longer supported".to_string(), )) } @@ -225,6 +218,22 @@ async fn instance_poke_post( Ok(HttpResponseUpdatedNoContent()) } +#[endpoint { + method = POST, + path = "/instances/{instance_id}/sim-migration-source", +}] +async fn instance_post_sim_migration_source( + rqctx: RequestContext>, + path_params: Path, + body: TypedBody, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + sa.instance_simulate_migration_source(instance_id, body.into_inner()) + .await?; + Ok(HttpResponseUpdatedNoContent()) +} + /// Path parameters for Disk requests (sled agent API) #[derive(Deserialize, JsonSchema)] struct DiskPathParam { diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index abd7b78223e..0ccf99fe251 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -8,21 +8,22 @@ use super::simulatable::Simulatable; use crate::common::instance::{ObservedPropolisState, PublishedVmmState}; use crate::nexus::NexusClient; -use crate::params::{InstanceMigrationSourceParams, InstanceStateRequested}; +use crate::params::InstanceStateRequested; use async_trait::async_trait; use chrono::Utc; use nexus_client; use omicron_common::api::external::Error; use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::{ - InstanceRuntimeState, SledInstanceState, VmmState, -}; +use omicron_common::api::internal::nexus::{SledInstanceState, VmmState}; use propolis_client::types::{ InstanceMigrateStatusResponse as PropolisMigrateResponse, InstanceMigrationStatus as PropolisMigrationStatus, InstanceState as PropolisInstanceState, InstanceStateMonitorResponse, }; +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; use std::collections::VecDeque; use std::sync::Arc; use std::sync::Mutex; @@ -30,6 +31,18 @@ use uuid::Uuid; use crate::common::instance::{Action as InstanceAction, InstanceStates}; +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SimulateMigrationSource { + pub(in crate::sim) migration_id: Uuid, + pub(in crate::sim) result: SimulatedMigrationResult, +} + +#[derive(Serialize, Deserialize, JsonSchema)] +pub(in crate::sim) enum 
SimulatedMigrationResult { + Success, + Failure, +} + #[derive(Clone, Debug)] enum MonitorChange { PropolisState(PropolisInstanceState), @@ -79,49 +92,68 @@ impl SimInstanceInner { self.queue.push_back(MonitorChange::MigrateStatus(migrate_status)) } - /// Queue a successful simulated migration. - /// - fn queue_successful_migration(&mut self) { + /// Queue a simulated migration out. + fn queue_migration_out( + &mut self, + migration_id: Uuid, + result: SimulatedMigrationResult, + ) { + let migration_update = |state| PropolisMigrateResponse { + migration_in: None, + migration_out: Some(PropolisMigrationStatus { + id: migration_id, + state, + }), + }; + // Propolis transitions to the Migrating state once before + // actually starting migration. + self.queue_propolis_state(PropolisInstanceState::Migrating); + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Sync, + )); + match result { + SimulatedMigrationResult::Success => { + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Finish, + )); + self.queue_graceful_stop(); + } + SimulatedMigrationResult::Failure => { + todo!("finish this part when we actuall need it...") + } + } + } + + /// Queue a simulated migration in. + fn queue_migration_in( + &mut self, + migration_id: Uuid, + result: SimulatedMigrationResult, + ) { + let migration_update = |state| PropolisMigrateResponse { + migration_in: Some(PropolisMigrationStatus { + id: migration_id, + state, + }), + migration_out: None, + }; // Propolis transitions to the Migrating state once before // actually starting migration. self.queue_propolis_state(PropolisInstanceState::Migrating); - todo!("eliza: fix this bit") - // match role { - // MigrationRole::Source => { - // self.queue_migration_update(PropolisMigrateResponse { - // migration_in: None, - // migration_out: Some(PropolisMigrationStatus { - // id: todo! 
- // state: propolis_client::types::MigrationState::Sync, - // }), - // }); - // self.queue_migration_update(PropolisMigrateResponse { - // migration_in: None, - // migration_out: Some(PropolisMigrationStatus { - // id: migration_id, - // state: propolis_client::types::MigrationState::Finish, - // }), - // }); - // self.queue_graceful_stop(); - // } - // MigrationRole::Target => { - // self.queue_migration_update(PropolisMigrateResponse { - // migration_in: Some(PropolisMigrationStatus { - // id: migration_id, - // state: propolis_client::types::MigrationState::Sync, - // }), - // migration_out: None, - // }); - // self.queue_migration_update(PropolisMigrateResponse { - // migration_in: Some(PropolisMigrationStatus { - // id: migration_id, - // state: propolis_client::types::MigrationState::Finish, - // }), - // migration_out: None, - // }); - // self.queue_propolis_state(PropolisInstanceState::Running) - // } - // } + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Sync, + )); + match result { + SimulatedMigrationResult::Success => { + self.queue_migration_update(migration_update( + propolis_client::types::MigrationState::Finish, + )); + self.queue_propolis_state(PropolisInstanceState::Running) + } + SimulatedMigrationResult::Failure => { + todo!("finish this part when we actuall need it...") + } + } } fn queue_graceful_stop(&mut self) { @@ -171,7 +203,13 @@ impl SimInstanceInner { ))); } - // self.queue_successful_migration(MigrationRole::Target) + let migration_id = self.state.migration_out() + .ok_or_else(|| Error::invalid_request("can't request migration in for a vmm that wasn't created with a migration ID"))? + .migration_id; + self.queue_migration_in( + migration_id, + SimulatedMigrationResult::Success, + ); } InstanceStateRequested::Running => { match self.next_resting_state() { @@ -361,38 +399,6 @@ impl SimInstanceInner { self.destroyed = true; self.state.sled_instance_state() } - - /// Stores a set of migration IDs in the instance's runtime state. - fn put_migration_ids( - &mut self, - ids: &Option, - ) -> Result { - if self.state.migration_ids_already_set(ids) { - return Ok(self.state.sled_instance_state()); - } - - self.state.set_migration_ids(ids, Utc::now()); - - // If we set migration IDs and are the migration source, ensure that we - // will perform the correct state transitions to simulate a successful - // migration. - // if ids.is_some() { - // let role = self - // .state - // .migration() - // .expect( - // "we just got a `put_migration_ids` request with `Some` IDs, \ - // so we should have a migration" - // ) - // .role; - // if role == MigrationRole::Source { - // self.queue_successful_migration(MigrationRole::Source) - // } - // } - todo!(); - - Ok(self.state.sled_instance_state()) - } } /// A simulation of an Instance created by the external Oxide API. 
@@ -420,13 +426,14 @@ impl SimInstance { self.inner.lock().unwrap().terminate() } - pub async fn put_migration_ids( + pub(crate) fn set_simulated_migration_source( &self, - old_runtime: &InstanceRuntimeState, - ids: &Option, - ) -> Result { - let mut inner = self.inner.lock().unwrap(); - inner.put_migration_ids(ids) + migration: SimulateMigrationSource, + ) { + self.inner + .lock() + .unwrap() + .queue_migration_in(migration.migration_id, migration.result); } } diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 746dd2f22ee..7b198695f23 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -7,14 +7,14 @@ use super::collection::{PokeMode, SimCollection}; use super::config::Config; use super::disk::SimDisk; -use super::instance::SimInstance; +use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; use crate::nexus::NexusClient; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, + InstanceMetadata, InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, }; use crate::sim::simulatable::Simulatable; use crate::updates::UpdateManager; @@ -30,7 +30,7 @@ use omicron_common::api::external::{ ByteCount, DiskState, Error, Generation, ResourceType, }; use omicron_common::api::internal::nexus::{ - DiskRuntimeState, SledInstanceState, + DiskRuntimeState, MigrationRuntimeState, MigrationState, SledInstanceState, }; use omicron_common::api::internal::nexus::{ InstanceRuntimeState, VmmRuntimeState, @@ -265,7 +265,7 @@ impl SledAgent { instance_id: InstanceUuid, propolis_id: PropolisUuid, hardware: InstanceHardware, - _instance_runtime: InstanceRuntimeState, + instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, metadata: InstanceMetadata, ) -> Result { @@ -368,6 +368,15 @@ impl SledAgent { } } + let migration_in = instance_runtime.migration_id.map(|migration_id| { + MigrationRuntimeState { + migration_id, + state: MigrationState::Pending, + gen: Generation::new(), + time_updated: chrono::Utc::now(), + } + }); + let instance_run_time_state = self .instances .sim_ensure( @@ -375,7 +384,7 @@ impl SledAgent { SledInstanceState { vmm_state: vmm_runtime, propolis_id, - migration_in: None, + migration_in, migration_out: None, }, None, @@ -540,6 +549,24 @@ impl SledAgent { Ok(instance.current()) } + pub async fn instance_simulate_migration_source( + &self, + instance_id: InstanceUuid, + migration: instance::SimulateMigrationSource, + ) -> Result<(), HttpError> { + let instance = self + .instances + .sim_get_cloned_object(&instance_id.into_untyped_uuid()) + .await + .map_err(|_| { + crate::sled_agent::Error::Instance( + crate::instance_manager::Error::NoSuchInstance(instance_id), + ) + })?; + instance.set_simulated_migration_source(migration); + Ok(()) + } + pub async fn set_instance_ensure_state_error(&self, error: Option) { *self.instance_ensure_state_error.lock().await = error; } @@ -563,20 +590,6 @@ impl SledAgent { Ok(()) } - pub async fn instance_put_migration_ids( - self: &Arc, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - let instance = self - .instances - .sim_get_cloned_object(&instance_id.into_untyped_uuid()) - .await?; - - instance.put_migration_ids(old_runtime, migration_ids).await - } - /// Idempotently ensures that the 
given API Disk (described by `api_disk`) /// is attached (or not) as specified. This simulates disk attach and /// detach, similar to instance boot and halt. diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 8fa18b0a633..f8454a0f7ba 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -18,9 +18,9 @@ use crate::nexus::{ }; use crate::params::{ DiskStateRequested, InstanceExternalIpBody, InstanceHardware, - InstanceMetadata, InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, OmicronZoneTypeExt, - TimeSync, VpcFirewallRule, ZoneBundleMetadata, Zpool, + InstanceMetadata, InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, OmicronZoneTypeExt, TimeSync, VpcFirewallRule, + ZoneBundleMetadata, Zpool, }; use crate::probe_manager::ProbeManager; use crate::services::{self, ServiceManager}; @@ -1011,23 +1011,6 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } - /// Idempotently ensures that the instance's runtime state contains the - /// supplied migration IDs, provided that the caller continues to meet the - /// conditions needed to change those IDs. See the doc comments for - /// [`crate::params::InstancePutMigrationIdsBody`]. - pub async fn instance_put_migration_ids( - &self, - instance_id: InstanceUuid, - old_runtime: &InstanceRuntimeState, - migration_ids: &Option, - ) -> Result { - self.inner - .instances - .put_migration_ids(instance_id, old_runtime, migration_ids) - .await - .map_err(|e| Error::Instance(e)) - } - /// Idempotently ensures that an instance's OPTE/port state includes the /// specified external IP address. /// From 67c268b398e4054df27d7ec6182d35e63d3450ab Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 20 Jun 2024 13:43:06 -0700 Subject: [PATCH 052/234] make instance-migrate sagas just set migration IDs --- nexus/db-queries/src/db/datastore/instance.rs | 158 ++++++++++++++++++ .../db-queries/src/db/datastore/migration.rs | 22 +-- nexus/src/app/instance.rs | 73 -------- nexus/src/app/sagas/instance_migrate.rs | 94 +++++++---- nexus/tests/integration_tests/instances.rs | 4 +- sled-agent/src/sim/http_entrypoints.rs | 1 + 6 files changed, 233 insertions(+), 119 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 0e1209933a9..7bde43dad52 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -518,6 +518,164 @@ impl DataStore { Ok(updated) } + /// Updates an instance record by setting the instance's migration ID. 
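+    ///
+    /// This is a conditional ("compare-and-swap") update: it only takes effect
+    /// if the instance currently has no migration ID or target Propolis ID set
+    /// and its active Propolis ID matches `src_propolis_id`. It returns
+    /// `Ok(true)` if the record was updated, `Ok(false)` if the requested IDs
+    /// were already set (so retries are idempotent), and an error if a
+    /// different migration is already in progress or the instance is not in
+    /// the expected state.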
+    pub async fn instance_set_migration_ids(
+        &self,
+        opctx: &OpContext,
+        instance_id: InstanceUuid,
+        src_propolis_id: PropolisUuid,
+        migration_id: Uuid,
+        target_propolis_id: PropolisUuid,
+    ) -> Result<bool, Error> {
+        use db::schema::instance::dsl;
+
+        let instance_id = instance_id.into_untyped_uuid();
+        let target_propolis_id = target_propolis_id.into_untyped_uuid();
+        let src_propolis_id = src_propolis_id.into_untyped_uuid();
+        let updated = diesel::update(dsl::instance)
+            .filter(dsl::time_deleted.is_null())
+            .filter(dsl::id.eq(instance_id))
+            .filter(dsl::migration_id.is_null())
+            .filter(dsl::target_propolis_id.is_null())
+            .filter(dsl::active_propolis_id.eq(src_propolis_id))
+            .set((
+                dsl::migration_id.eq(Some(migration_id)),
+                dsl::target_propolis_id.eq(Some(target_propolis_id)),
+                // advance the generation
+                dsl::state_generation.eq(dsl::state_generation + 1),
+                dsl::time_state_updated.eq(Utc::now()),
+            ))
+            .check_if_exists::<Instance>(instance_id.into_untyped_uuid())
+            .execute_and_check(&*self.pool_connection_authorized(&opctx).await?)
+            .await
+            .map_err(|e| {
+                public_error_from_diesel(
+                    e,
+                    ErrorHandler::NotFoundByLookup(
+                        ResourceType::Instance,
+                        LookupType::ById(instance_id),
+                    ),
+                )
+            })?;
+
+        match updated {
+            // If we updated the instance, that's great! Good job team!
+            UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => {
+                Ok(true)
+            }
+            // No update was performed because the migration ID has already been
+            // set to the ID we were trying to set it to. That's fine, count it
+            // as a success.
+            UpdateAndQueryResult {
+                found: Instance { runtime_state, .. },
+                ..
+            } if runtime_state.migration_id == Some(migration_id) => {
+                debug_assert_eq!(
+                    runtime_state.dst_propolis_id,
+                    Some(target_propolis_id)
+                );
+                debug_assert_eq!(
+                    runtime_state.propolis_id,
+                    Some(src_propolis_id)
+                );
+                Ok(false)
+            }
+
+            // On the other hand, if there was already a different migration ID,
+            // that means another migrate saga has already started a migration.
+            // Guess I'll die!
+            UpdateAndQueryResult {
+                found:
+                    Instance {
+                        runtime_state:
+                            InstanceRuntimeState {
+                                migration_id: Some(actual_migration_id),
+                                ..
+                            },
+                        ..
+                    },
+                ..
+            } => {
+                slog::info!(
+                    opctx.log,
+                    "failed to set instance migration IDs: a different migration ID was already set";
+                    "instance_id" => %instance_id,
+                    "desired_migration_id" => %migration_id,
+                    "actual_migration_id" => %actual_migration_id,
+                );
+                Err(Error::conflict("instance is already migrating"))
+            }
+            // If one of the other filters didn't match, our understanding of
+            // the instance's state is clearly pretty wrong.
+            UpdateAndQueryResult {
+                found: Instance { runtime_state, .. },
+                ..
+            } => {
+                slog::warn!(
+                    opctx.log,
+                    "failed to set instance migration IDs: one of its Propolis IDs wasn't what we anticipated!";
+                    "instance_id" => %instance_id,
+                    "desired_migration_id" => %migration_id,
+                    "desired_active_propolis_id" => %src_propolis_id,
+                    "desired_target_propolis_id" => %target_propolis_id,
+                    "actual_migration_id" => ?runtime_state.migration_id,
+                    "actual_active_propolis_id" => ?runtime_state.propolis_id,
+                    "actual_target_propolis_id" => ?runtime_state.dst_propolis_id,
+                );
+                Err(Error::conflict(
+                    "instance snapshot didn't match actual state",
+                ))
+            }
+        }
+    }
+
+    /// Unsets the migration IDs set by
+    /// [`DataStore::instance_set_migration_ids`].
+    ///
+    /// This method will only unset the instance's migration IDs if they match
+    /// the provided ones.
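+    ///
+    /// A sketch of how a saga's forward and undo actions might pair
+    /// [`DataStore::instance_set_migration_ids`] with this method (variable
+    /// names are illustrative; the real actions live in the `instance_migrate`
+    /// saga):
+    ///
+    /// ```ignore
+    /// // Forward action: compare-and-swap the migration IDs into the
+    /// // instance record.
+    /// datastore
+    ///     .instance_set_migration_ids(
+    ///         &opctx,
+    ///         instance_id,
+    ///         src_propolis_id,
+    ///         migration_id,
+    ///         dst_propolis_id,
+    ///     )
+    ///     .await?;
+    ///
+    /// // Undo action: clear the IDs, but only if they are still the ones
+    /// // this saga set.
+    /// datastore
+    ///     .instance_unset_migration_ids(
+    ///         &opctx,
+    ///         instance_id,
+    ///         migration_id,
+    ///         dst_propolis_id,
+    ///     )
+    ///     .await?;
+    /// ```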
+ pub async fn instance_unset_migration_ids( + &self, + opctx: &OpContext, + instance_id: InstanceUuid, + migration_id: Uuid, + target_propolis_id: PropolisUuid, + ) -> Result { + use db::schema::instance::dsl; + + let instance_id = instance_id.into_untyped_uuid(); + let target_propolis_id = target_propolis_id.into_untyped_uuid(); + let updated = diesel::update(dsl::instance) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(instance_id)) + .filter(dsl::migration_id.eq(migration_id)) + .filter(dsl::target_propolis_id.eq(target_propolis_id)) + .set(( + dsl::migration_id.eq(None::), + dsl::target_propolis_id.eq(None::), + // advance the generation + dsl::state_generation.eq(dsl::state_generation + 1), + dsl::time_state_updated.eq(Utc::now()), + )) + .check_if_exists::(instance_id.into_untyped_uuid()) + .execute_and_check(&*self.pool_connection_authorized(&opctx).await?) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(instance_id), + ), + ) + })?; + Ok(updated) + } + /// Updates an instance record and a VMM record with a single database /// command. /// diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 5efe88e83f3..049f0b0f6ff 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -76,24 +76,24 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// Marks a migration record as deleted if and only if both sides of the - /// migration are in a terminal state. - pub async fn migration_terminate( + /// Marks a migration record as failed. + pub async fn migration_mark_failed( &self, opctx: &OpContext, migration_id: Uuid, ) -> UpdateResult { - const TERMINAL_STATES: &[MigrationState] = &[ - MigrationState(nexus::MigrationState::Completed), - MigrationState(nexus::MigrationState::Failed), - ]; - + let failed = MigrationState(nexus::MigrationState::Failed); diesel::update(dsl::migration) .filter(dsl::id.eq(migration_id)) .filter(dsl::time_deleted.is_null()) - .filter(dsl::source_state.eq_any(TERMINAL_STATES)) - .filter(dsl::target_state.eq_any(TERMINAL_STATES)) - .set(dsl::time_deleted.eq(Utc::now())) + .set(( + dsl::source_state.eq(failed), + dsl::source_gen.eq(dsl::source_gen + 1), + dsl::time_source_updated.eq(Utc::now()), + dsl::target_state.eq(failed), + dsl::target_gen.eq(dsl::target_gen + 1), + dsl::time_target_updated.eq(Utc::now()), + )) .check_if_exists::(migration_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) 
.await diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 67db4377d91..6542bd962cd 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -59,10 +59,8 @@ use propolis_client::support::InstanceSerialConsoleHelper; use propolis_client::support::WSClientOffset; use propolis_client::support::WebSocketStream; use sagas::instance_common::ExternalIpAttach; -use sled_agent_client::types::InstanceMigrationSourceParams; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::InstanceProperties; -use sled_agent_client::types::InstancePutMigrationIdsBody; use sled_agent_client::types::InstancePutStateBody; use std::matches; use std::net::SocketAddr; @@ -529,77 +527,6 @@ impl super::Nexus { self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await } - /// Attempts to set the migration IDs for the supplied instance via the - /// sled specified in `db_instance`. - /// - /// The caller is assumed to have fetched the current instance record from - /// the DB and verified that the record has no migration IDs. - /// - /// Returns `Ok` and the updated instance record if this call successfully - /// updated the instance with the sled agent and that update was - /// successfully reflected into CRDB. Returns `Err` with an appropriate - /// error otherwise. - /// - /// # Panics - /// - /// Asserts that `db_instance` has no migration ID or destination Propolis - /// ID set. - pub(crate) async fn instance_set_migration_ids( - &self, - opctx: &OpContext, - instance_id: InstanceUuid, - src_propolis_id: PropolisUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - migration_params: InstanceMigrationSourceParams, - ) -> UpdateResult { - assert!(prev_instance_runtime.migration_id.is_none()); - assert!(prev_instance_runtime.dst_propolis_id.is_none()); - - let (.., authz_instance) = LookupPath::new(opctx, &self.db_datastore) - .instance_id(instance_id.into_untyped_uuid()) - .lookup_for(authz::Action::Modify) - .await?; - - let instance_updated = todo!("eliza: do this transition purely in nexus rather than in sled-agent..."); - - if instance_updated { - Ok(self - .db_datastore - .instance_refetch(opctx, &authz_instance) - .await?) - } else { - Err(Error::conflict( - "instance is already migrating, or underwent an operation that \ - prevented this migration from proceeding" - )) - } - } - - /// Attempts to clear the migration IDs for the supplied instance via the - /// sled specified in `db_instance`. - /// - /// The supplied instance record must contain valid migration IDs. - /// - /// Returns `Ok` if sled agent accepted the request to clear migration IDs - /// and the resulting attempt to write instance runtime state back to CRDB - /// succeeded. This routine returns `Ok` even if the update was not actually - /// applied (due to a separate generation number change). - /// - /// # Panics - /// - /// Asserts that `db_instance` has a migration ID and destination Propolis - /// ID set. - pub(crate) async fn instance_clear_migration_ids( - &self, - instance_id: InstanceUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - ) -> Result<(), Error> { - assert!(prev_instance_runtime.migration_id.is_some()); - assert!(prev_instance_runtime.dst_propolis_id.is_some()); - - todo!("eliza: do this transition in the DB rather than in sled-agent") - } - /// Reboot the specified instance. 
pub(crate) async fn instance_reboot( &self, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 23b8c94a7d2..5d371cec374 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -16,9 +16,7 @@ use nexus_db_queries::{authn, authz, db}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use serde::Deserialize; use serde::Serialize; -use sled_agent_client::types::{ - InstanceMigrationSourceParams, InstanceMigrationTargetParams, -}; +use sled_agent_client::types::InstanceMigrationTargetParams; use slog::warn; use std::net::{Ipv6Addr, SocketAddr}; use steno::ActionError; @@ -72,12 +70,27 @@ declare_saga_actions! { CREATE_MIGRATION_RECORD -> "migration_record" { + sim_create_migration_record - - sim_delete_migration_record + - sim_fail_migration_record } - // This step the instance's migration ID and destination Propolis ID - // fields. + // fields in the database. + // + // If the instance's migration ID has already been set when we attempt to + // set ours, that means we have probably raced with another migrate saga for + // the same instance. If this is the case, this action will fail and the + // saga will unwind. + // + // Yes, it's a bit unfortunate that our attempt to compare-and-swap in a + // migration ID happens only after we've created VMM and migration records, + // and that we'll have to destroy them as we unwind. However, the + // alternative, setting the migration IDs *before* records for the target + // VMM and the migration are created, would mean that there is a period of + // time during which the instance record contains foreign keys into the + // `vmm` and `migration` tables that don't have corresponding records to + // those tables. Because the `instance` table is queried in the public API, + // we take care to ensure that it doesn't have "dangling pointers" to + // records in the `vmm` and `migration` tables that don't exist yet. SET_MIGRATION_IDS -> "set_migration_ids" { + sim_set_migration_ids - sim_clear_migration_ids @@ -232,7 +245,7 @@ async fn sim_create_migration_record( .map_err(ActionError::action_failed) } -async fn sim_delete_migration_record( +async fn sim_fail_migration_record( sagactx: NexusActionContext, ) -> Result<(), anyhow::Error> { let osagactx: &std::sync::Arc = @@ -244,9 +257,24 @@ async fn sim_delete_migration_record( ); let migration_id = sagactx.lookup::("migrate_id")?; - info!(osagactx.log(), "deleting migration record"; - "migration_id" => %migration_id); - osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await?; + info!( + osagactx.log(), + "migration saga unwinding, marking migration record as failed"; + "instance_id" => %params.instance.id(), + "migration_id" => %migration_id, + ); + // If the migration record wasn't updated, this means it's already deleted, + // which...seems weird, but isn't worth getting the whole saga unwind stuck over. 
+ if let Err(e) = + osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await + { + warn!(osagactx.log(), + "Error marking migration record as failed during rollback"; + "instance_id" => %params.instance.id(), + "migration_id" => %migration_id, + "error" => ?e); + } + Ok(()) } @@ -306,7 +334,7 @@ async fn sim_destroy_vmm_record( async fn sim_set_migration_ids( sagactx: NexusActionContext, -) -> Result { +) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( @@ -328,19 +356,19 @@ async fn sim_set_migration_ids( "dst_propolis_id" => %dst_propolis_id, "prev_runtime_state" => ?db_instance.runtime()); - let updated_record = osagactx - .nexus() + osagactx + .datastore() .instance_set_migration_ids( &opctx, instance_id, src_propolis_id, - db_instance.runtime(), - InstanceMigrationSourceParams { dst_propolis_id, migration_id }, + migration_id, + dst_propolis_id, ) .await .map_err(ActionError::action_failed)?; - Ok(updated_record) + Ok(()) } async fn sim_clear_migration_ids( @@ -348,31 +376,31 @@ async fn sim_clear_migration_ids( ) -> Result<(), anyhow::Error> { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id); - let db_instance = - sagactx.lookup::("set_migration_ids")?; + let db_instance = params.instance; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); + let migration_id = sagactx.lookup::("migrate_id")?; + let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; + info!(osagactx.log(), "clearing migration IDs for saga unwind"; "instance_id" => %db_instance.id(), "sled_id" => %src_sled_id, - "prev_runtime_state" => ?db_instance.runtime()); + "migration_id" => %migration_id, + "dst_propolis_id" => %dst_propolis_id); - // Because the migration never actually started (and thus didn't finish), - // the instance should be at the same Propolis generation as it was when - // migration IDs were set, which means sled agent should accept a request to - // clear them. The only exception is if the instance stopped, but that also - // clears its migration IDs; in that case there is no work to do here. - // - // Other failures to clear migration IDs are handled like any other failure - // to update an instance's state: the callee attempts to mark the instance - // as failed; if the failure occurred because the instance changed state - // such that sled agent could not fulfill the request, the callee will - // produce a stale generation number and will not actually mark the instance - // as failed. 
if let Err(e) = osagactx - .nexus() - .instance_clear_migration_ids(instance_id, db_instance.runtime()) + .datastore() + .instance_unset_migration_ids( + &opctx, + instance_id, + migration_id, + dst_propolis_id, + ) .await { warn!(osagactx.log(), diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index d670ebc43e3..8cd49a4d029 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1005,7 +1005,7 @@ async fn test_instance_migrate_v2p_and_routes( instance_simulate_migration_source( cptestctx, nexus, - original_sled, + original_sled_id, instance_id, migration_id, ) @@ -4987,7 +4987,7 @@ async fn instance_simulate_migration_source( "migration_id" => %migration_id, ); let sa = nexus.sled_client(&sled_id).await.unwrap(); - sa.instance_simulate_migrationSource( + sa.instance_simulate_migration_source( instance_id.into_untyped_uuid(), sled_agent_client::SimulateMigrationSource { migration_id, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 40216457b35..43aeec72a5c 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -53,6 +53,7 @@ pub fn api() -> SledApiDescription { api.register(instance_put_external_ip)?; api.register(instance_delete_external_ip)?; api.register(instance_poke_post)?; + api.register(instance_post_sim_migration_source)?; api.register(disk_put)?; api.register(disk_poke_post)?; api.register(update_artifact)?; From 3bbe6bbc67866173dbb73b0eebe294d1aef33337 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 10:43:24 -0700 Subject: [PATCH 053/234] cleanup --- clients/sled-agent-client/src/lib.rs | 1 + sled-agent/src/common/instance.rs | 5 ----- sled-agent/src/instance.rs | 7 ++++++- sled-agent/src/sim/instance.rs | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 98671ebf48d..158e8676174 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -490,6 +490,7 @@ impl TestInterfaces for Client { let url = format!("{baseurl}/instances/{id}/sim-migration-source"); client .post(url) + .json(¶ms) .send() .await .expect("instance_simulate_migration_source() failed unexpectedly"); diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 87939cb7cf2..4879657db38 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -447,11 +447,6 @@ mod test { } } - fn test_termination_fails_in_progress_migration( - mk_instance: impl Fn() -> InstanceStates, - ) { - } - #[test] fn source_termination_fails_in_progress_migration() { for state in [Observed::Destroyed, Observed::Failed] { diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index e271d268120..f8c00e9d863 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -693,7 +693,12 @@ impl InstanceRunner { let migrate = match migrate { Some(params) => { - let migration_id = todo!("eliza: this probably needs to be sent by Nexus directly now?"); + let migration_id = self.state + .migration_in() + // TODO(eliza): this is a bit of a shame; it would be nice + // to refactor this code so we don't unwrap here. 
+ .expect("if we have migration target params, we should also have a migration in") + .migration_id; Some(propolis_client::types::InstanceMigrateInitiateRequest { src_addr: params.src_propolis_addr.to_string(), src_uuid: params.src_propolis_id, diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 0ccf99fe251..0c87369539c 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -151,7 +151,7 @@ impl SimInstanceInner { self.queue_propolis_state(PropolisInstanceState::Running) } SimulatedMigrationResult::Failure => { - todo!("finish this part when we actuall need it...") + todo!("finish this part when we actually need it...") } } } @@ -433,7 +433,7 @@ impl SimInstance { self.inner .lock() .unwrap() - .queue_migration_in(migration.migration_id, migration.result); + .queue_migration_out(migration.migration_id, migration.result); } } From edbfdc8f146b2dd51501d18da46e41c24f4c605f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 10:44:31 -0700 Subject: [PATCH 054/234] update CTE expected SQL --- ...and_vmm_update_vmm_and_both_migrations.sql | 95 ++++++++++++++ ...stance_and_vmm_update_vmm_and_instance.sql | 9 +- ...ce_and_vmm_update_vmm_and_migration_in.sql | 63 ++++++++++ ...e_and_vmm_update_vmm_and_migration_out.sql | 63 ++++++++++ ...pdate_vmm_instance_and_both_migrations.sql | 119 ++++++++++++++++++ ...m_update_vmm_instance_and_migration_in.sql | 87 +++++++++++++ ..._update_vmm_instance_and_migration_out.sql | 87 +++++++++++++ .../instance_and_vmm_update_vmm_only.sql | 2 +- 8 files changed, 523 insertions(+), 2 deletions(-) create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql create mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql new file mode 100644 index 00000000000..15f5ec00890 --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql @@ -0,0 +1,95 @@ +WITH + migration_in_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_in_updated + AS ( + UPDATE + migration + SET + target_state = $2, time_target_updated = $3 + WHERE + (migration.id = $4 AND migration.target_propolis_id = $5) AND migration.target_gen < $6 + RETURNING + id + ), + migration_in_result + AS ( + SELECT + migration_in_found.id AS found, migration_in_updated.id AS updated + FROM + migration_in_found + LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id + ), + migration_out_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $7 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_out_updated + AS ( + UPDATE + migration + SET + source_state = $8, time_source_updated = $9 + WHERE + (migration.id = $10 AND migration.source_propolis_id = $11) 
AND migration.source_gen < $12 + RETURNING + id + ), + migration_out_result + AS ( + SELECT + migration_out_found.id AS found, migration_out_updated.id AS updated + FROM + migration_out_found + LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $13) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $14, state_generation = $15, state = $16 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $17) AND vmm.state_generation < $18 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + NULL, + NULL, + migration_in_result.found, + migration_in_result.updated, + migration_out_result.found, + migration_out_result.updated +FROM + vmm_result, migration_in_result, migration_out_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql index ab4ef78b182..3014e9068fb 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql @@ -43,6 +43,13 @@ WITH vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id ) SELECT - vmm_result.found, vmm_result.updated, instance_result.found, instance_result.updated, NULL, NULL + vmm_result.found, + vmm_result.updated, + instance_result.found, + instance_result.updated, + NULL, + NULL, + NULL, + NULL FROM vmm_result, instance_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql new file mode 100644 index 00000000000..03f6d27d2cd --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql @@ -0,0 +1,63 @@ +WITH + migration_in_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_in_updated + AS ( + UPDATE + migration + SET + target_state = $2, time_target_updated = $3 + WHERE + (migration.id = $4 AND migration.target_propolis_id = $5) AND migration.target_gen < $6 + RETURNING + id + ), + migration_in_result + AS ( + SELECT + migration_in_found.id AS found, migration_in_updated.id AS updated + FROM + migration_in_found + LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $7) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $8, state_generation = $9, state = $10 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $11) AND vmm.state_generation < $12 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + NULL, + NULL, + migration_in_result.found, + migration_in_result.updated, + NULL, + NULL +FROM + vmm_result, migration_in_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql new file mode 100644 index 00000000000..6dd4ab55205 --- /dev/null +++ 
b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql @@ -0,0 +1,63 @@ +WITH + migration_out_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $1 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_out_updated + AS ( + UPDATE + migration + SET + source_state = $2, time_source_updated = $3 + WHERE + (migration.id = $4 AND migration.source_propolis_id = $5) AND migration.source_gen < $6 + RETURNING + id + ), + migration_out_result + AS ( + SELECT + migration_out_found.id AS found, migration_out_updated.id AS updated + FROM + migration_out_found + LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $7) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $8, state_generation = $9, state = $10 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $11) AND vmm.state_generation < $12 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + NULL, + NULL, + NULL, + NULL, + migration_out_result.found, + migration_out_result.updated +FROM + vmm_result, migration_out_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql new file mode 100644 index 00000000000..3959f323c92 --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql @@ -0,0 +1,119 @@ +WITH + instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), + instance_updated + AS ( + UPDATE + instance + SET + time_state_updated = $2, + state_generation = $3, + active_propolis_id = $4, + target_propolis_id = $5, + migration_id = $6, + state = $7 + WHERE + ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 + RETURNING + id + ), + instance_result + AS ( + SELECT + instance_found.id AS found, instance_updated.id AS updated + FROM + instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id + ), + migration_in_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $10 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_in_updated + AS ( + UPDATE + migration + SET + target_state = $11, time_target_updated = $12 + WHERE + (migration.id = $13 AND migration.target_propolis_id = $14) AND migration.target_gen < $15 + RETURNING + id + ), + migration_in_result + AS ( + SELECT + migration_in_found.id AS found, migration_in_updated.id AS updated + FROM + migration_in_found + LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id + ), + migration_out_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $16 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_out_updated + AS ( + UPDATE + migration + SET + source_state = $17, time_source_updated = $18 + WHERE + (migration.id = $19 AND migration.source_propolis_id = $20) AND migration.source_gen < $21 + RETURNING + id + ), + migration_out_result + AS ( + SELECT + migration_out_found.id AS found, migration_out_updated.id AS updated + FROM + migration_out_found + LEFT JOIN migration_out_updated ON 
migration_out_found.id = migration_out_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $22) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $23, state_generation = $24, state = $25 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $26) AND vmm.state_generation < $27 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + instance_result.found, + instance_result.updated, + migration_in_result.found, + migration_in_result.updated, + migration_out_result.found, + migration_out_result.updated +FROM + vmm_result, instance_result, migration_in_result, migration_out_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql new file mode 100644 index 00000000000..ab73df18048 --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql @@ -0,0 +1,87 @@ +WITH + instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), + instance_updated + AS ( + UPDATE + instance + SET + time_state_updated = $2, + state_generation = $3, + active_propolis_id = $4, + target_propolis_id = $5, + migration_id = $6, + state = $7 + WHERE + ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 + RETURNING + id + ), + instance_result + AS ( + SELECT + instance_found.id AS found, instance_updated.id AS updated + FROM + instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id + ), + migration_in_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $10 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_in_updated + AS ( + UPDATE + migration + SET + target_state = $11, time_target_updated = $12 + WHERE + (migration.id = $13 AND migration.target_propolis_id = $14) AND migration.target_gen < $15 + RETURNING + id + ), + migration_in_result + AS ( + SELECT + migration_in_found.id AS found, migration_in_updated.id AS updated + FROM + migration_in_found + LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $16) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $17, state_generation = $18, state = $19 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $20) AND vmm.state_generation < $21 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + instance_result.found, + instance_result.updated, + migration_in_result.found, + migration_in_result.updated, + NULL, + NULL +FROM + vmm_result, instance_result, migration_in_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql new file mode 100644 index 00000000000..eb4e558d95d --- /dev/null +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql @@ -0,0 +1,87 @@ +WITH + instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), + 
instance_updated + AS ( + UPDATE + instance + SET + time_state_updated = $2, + state_generation = $3, + active_propolis_id = $4, + target_propolis_id = $5, + migration_id = $6, + state = $7 + WHERE + ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 + RETURNING + id + ), + instance_result + AS ( + SELECT + instance_found.id AS found, instance_updated.id AS updated + FROM + instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id + ), + migration_out_found + AS ( + SELECT + ( + SELECT + migration.id + FROM + migration + WHERE + migration.id = $10 AND (migration.time_deleted IS NULL) + ) + AS id + ), + migration_out_updated + AS ( + UPDATE + migration + SET + source_state = $11, time_source_updated = $12 + WHERE + (migration.id = $13 AND migration.source_propolis_id = $14) AND migration.source_gen < $15 + RETURNING + id + ), + migration_out_result + AS ( + SELECT + migration_out_found.id AS found, migration_out_updated.id AS updated + FROM + migration_out_found + LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id + ), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $16) AS id), + vmm_updated + AS ( + UPDATE + vmm + SET + time_state_updated = $17, state_generation = $18, state = $19 + WHERE + ((vmm.time_deleted IS NULL) AND vmm.id = $20) AND vmm.state_generation < $21 + RETURNING + id + ), + vmm_result + AS ( + SELECT + vmm_found.id AS found, vmm_updated.id AS updated + FROM + vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id + ) +SELECT + vmm_result.found, + vmm_result.updated, + instance_result.found, + instance_result.updated, + NULL, + NULL, + migration_out_result.found, + migration_out_result.updated +FROM + vmm_result, instance_result, migration_out_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql index cfe56740fe7..8f81e662a96 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql @@ -19,6 +19,6 @@ WITH vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id ) SELECT - vmm_result.found, vmm_result.updated, NULL, NULL, NULL, NULL + vmm_result.found, vmm_result.updated, NULL, NULL, NULL, NULL, NULL, NULL FROM vmm_result From 3bdb0941df4602f2d34a607beb9fceedc1252990 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 11:04:47 -0700 Subject: [PATCH 055/234] whoops that was the sled id and not the vmm ID --- nexus/src/app/sagas/instance_migrate.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 5d371cec374..27e851f8425 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -344,8 +344,7 @@ async fn sim_set_migration_ids( let db_instance = ¶ms.instance; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); - let src_propolis_id = - PropolisUuid::from_untyped_uuid(params.src_vmm.sled_id); + let src_propolis_id = PropolisUuid::from_untyped_uuid(params.src_vmm.id); let migration_id = sagactx.lookup::("migrate_id")?; let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; @@ -380,17 +379,16 @@ async fn sim_clear_migration_ids( &sagactx, ¶ms.serialized_authn, ); - let src_sled_id = SledUuid::from_untyped_uuid(params.src_vmm.sled_id); let db_instance = 
params.instance; let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); - + let src_propolis_id = PropolisUuid::from_untyped_uuid(params.src_vmm.id); let migration_id = sagactx.lookup::("migrate_id")?; let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; info!(osagactx.log(), "clearing migration IDs for saga unwind"; "instance_id" => %db_instance.id(), - "sled_id" => %src_sled_id, "migration_id" => %migration_id, + "src_propolis_id" => %src_propolis_id, "dst_propolis_id" => %dst_propolis_id); if let Err(e) = osagactx @@ -406,6 +404,8 @@ async fn sim_clear_migration_ids( warn!(osagactx.log(), "Error clearing migration IDs during rollback"; "instance_id" => %instance_id, + "src_propolis_id" => %src_propolis_id, + "dst_propolis_id" => %dst_propolis_id, "error" => ?e); } From 63ec92a7f1d2af4ab5534e53c581d37b21692a69 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 11:29:57 -0700 Subject: [PATCH 056/234] just return instance in instance_set_migration_ids this way, the `instance-migrate` saga doesn't have to change as much... --- nexus/db-queries/src/db/datastore/instance.rs | 19 +++++++++---------- nexus/src/app/sagas/instance_migrate.rs | 6 ++---- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 7bde43dad52..37cd08605e7 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -526,7 +526,7 @@ impl DataStore { src_propolis_id: PropolisUuid, migration_id: Uuid, target_propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { use db::schema::instance::dsl; let instance_id = instance_id.into_untyped_uuid(); @@ -560,25 +560,24 @@ impl DataStore { match updated { // If we updated the instance, that's great! Good job team! - UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { - Ok(true) + UpdateAndQueryResult { status: UpdateStatus::Updated, found } => { + Ok(found) } // No update was performed because the migration ID has already been // set to the ID we were trying to set it to. That's fine, count it // as a success. - UpdateAndQueryResult { - found: Instance { runtime_state, .. }, - .. - } if runtime_state.migration_id == Some(migration_id) => { + UpdateAndQueryResult { found, .. 
} + if found.runtime_state.migration_id == Some(migration_id) => + { debug_assert_eq!( - runtime_state.dst_propolis_id, + found.runtime_state.dst_propolis_id, Some(target_propolis_id) ); debug_assert_eq!( - runtime_state.propolis_id, + found.runtime_state.propolis_id, Some(src_propolis_id) ); - Ok(false) + Ok(found) } // On the other hand, if there was already a different migration ID, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 27e851f8425..b495b24ccd9 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -334,7 +334,7 @@ async fn sim_destroy_vmm_record( async fn sim_set_migration_ids( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( @@ -365,9 +365,7 @@ async fn sim_set_migration_ids( dst_propolis_id, ) .await - .map_err(ActionError::action_failed)?; - - Ok(()) + .map_err(ActionError::action_failed) } async fn sim_clear_migration_ids( From ad94f20575afe2889adfae751829e5b0b3583f99 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 11:39:27 -0700 Subject: [PATCH 057/234] fix sim sled-agent looking at the wrong migration --- sled-agent/src/sim/instance.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 0c87369539c..eba522fbfc5 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -203,8 +203,15 @@ impl SimInstanceInner { ))); } - let migration_id = self.state.migration_out() - .ok_or_else(|| Error::invalid_request("can't request migration in for a vmm that wasn't created with a migration ID"))? + let migration_id = self + .state + .migration_in() + .ok_or_else(|| { + Error::invalid_request( + "can't request migration in for a vmm that wasn't \ + created with a migration ID", + ) + })? .migration_id; self.queue_migration_in( migration_id, From 796f75927961ffbb4f5edf68a7b603a1226e6ba5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 11:56:28 -0700 Subject: [PATCH 058/234] sled-agent: make sure migration-in is populated --- sled-agent/src/common/instance.rs | 46 +++++++++++++++++------------- sled-agent/src/instance.rs | 6 +++- sled-agent/src/instance_manager.rs | 1 + sled-agent/src/sim/instance.rs | 1 + 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 4879657db38..4e0d7a57fdc 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -173,13 +173,21 @@ pub enum Action { } impl InstanceStates { - pub fn new(vmm: VmmRuntimeState, propolis_id: PropolisUuid) -> Self { - InstanceStates { - vmm, - propolis_id, - migration_in: None, - migration_out: None, - } + pub fn new( + vmm: VmmRuntimeState, + propolis_id: PropolisUuid, + migration_id: Option, + ) -> Self { + // If this instance is created with a migration ID, we are the intended + // target of a migration in. Set that up now. 
+ let migration_in = + migration_id.map(|migration_id| MigrationRuntimeState { + migration_id, + state: MigrationState::Pending, + gen: Generation::new(), + time_updated: Utc::now(), + }); + InstanceStates { vmm, propolis_id, migration_in, migration_out: None } } pub fn vmm(&self) -> &VmmRuntimeState { @@ -376,7 +384,7 @@ mod test { time_updated: now, }; - InstanceStates::new(vmm, propolis_id) + InstanceStates::new(vmm, propolis_id, None) } fn make_migration_source_instance() -> InstanceStates { @@ -396,18 +404,16 @@ mod test { } fn make_migration_target_instance() -> InstanceStates { - let mut state = make_instance(); - state.vmm.state = VmmState::Migrating; - let migration_id = Uuid::new_v4(); - state.migration_in = Some(MigrationRuntimeState { - migration_id, - state: MigrationState::InProgress, - // advance the generation once, since we are starting out in the - // `InProgress` state. - gen: Generation::new().next(), - time_updated: Utc::now(), - }); - state + let propolis_id = PropolisUuid::new_v4(); + let now = Utc::now(); + + let vmm = VmmRuntimeState { + state: VmmState::Migrating, + gen: Generation::new(), + time_updated: now, + }; + + InstanceStates::new(vmm, propolis_id, Some(Uuid::new_v4())) } fn make_observed_state( diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index f8c00e9d863..3b17b68afcb 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -965,6 +965,8 @@ impl Instance { /// * `log`: Logger for dumping debug information. /// * `id`: UUID of the instance to be created. /// * `propolis_id`: UUID for the VMM to be created. + /// * `migration_id`: UUID of the migration in to this VMM, if the VMM is + /// being created as the target of an active migration. /// * `ticket`: A ticket that ensures this instance is a member of its /// instance manager's tracking table. /// * `state`: The initial state of this instance. 
@@ -976,6 +978,7 @@ impl Instance { log: Logger, id: InstanceUuid, propolis_id: PropolisUuid, + migration_id: Option, ticket: InstanceTicket, state: InstanceInitialState, services: InstanceManagerServices, @@ -985,6 +988,7 @@ impl Instance { info!(log, "initializing new Instance"; "instance_id" => %id, "propolis_id" => %propolis_id, + "migration_id" => ?migration_id, "state" => ?state); let InstanceInitialState { @@ -1078,7 +1082,7 @@ impl Instance { dhcp_config, requested_disks: hardware.disks, cloud_init_bytes: hardware.cloud_init_bytes, - state: InstanceStates::new(vmm_runtime, propolis_id), + state: InstanceStates::new(vmm_runtime, propolis_id, migration_id), running_state: None, nexus_client, storage, diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 012af14b6ba..2cbaf32dd32 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -627,6 +627,7 @@ impl InstanceManagerRunner { instance_log, instance_id, propolis_id, + instance_runtime.migration_id, ticket, state, services, diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index eba522fbfc5..38f987f67a1 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -465,6 +465,7 @@ impl Simulatable for SimInstance { state: InstanceStates::new( current.vmm_state, current.propolis_id, + current.migration_in.map(|m| m.migration_id), ), last_response: InstanceStateMonitorResponse { gen: 1, From 64bbb3a17414bc86b2618ef38e8c7c904f057568 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 12:05:46 -0700 Subject: [PATCH 059/234] start sketching out migration-update subsaga --- nexus/db-model/src/migration.rs | 16 +++ .../app/sagas/instance_update/destroyed.rs | 15 ++- .../app/sagas/instance_update/migration.rs | 100 ++++++++++++++++++ nexus/src/app/sagas/instance_update/mod.rs | 47 ++++++++ 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 nexus/src/app/sagas/instance_update/migration.rs diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index 4e3ca1b35d0..eceb7e15b3a 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -89,4 +89,20 @@ impl Migration { time_target_updated: None, } } + + pub const COMPLETED: MigrationState = + MigrationState(nexus::MigrationState::Completed); + pub const FAILED: MigrationState = + MigrationState(nexus::MigrationState::Failed); + + /// Returns `true` if either side of the migration has failed. + pub fn either_side_failed(&self) -> bool { + self.source_state == Self::FAILED || self.target_state == Self::FAILED + } + + /// Returns `true` if either side of the migration has completed. + pub fn either_side_completed(&self) -> bool { + self.source_state == Self::COMPLETED + || self.target_state == Self::COMPLETED + } } diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index cba2c31bf39..33589eab5fb 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -269,13 +269,24 @@ async fn siud_update_instance( ); // It's okay for this to fail, it just means that the active VMM ID has changed. 
- let _ = osagactx + if let Err(e) = osagactx .datastore() .instance_update_runtime( &InstanceUuid::from_untyped_uuid(authz_instance.id()), &new_runtime, ) - .await; + .await + { + warn!( + osagactx.log(), + "instance update (VMM destroyed): updating runtime state failed"; + "instance_id" => %authz_instance.id(), + "propolis_id" => %vmm_id, + "new_runtime_state" => ?new_runtime, + "instance_update" => %"VMM destroyed", + "error" => %e, + ); + } Ok(()) } diff --git a/nexus/src/app/sagas/instance_update/migration.rs b/nexus/src/app/sagas/instance_update/migration.rs new file mode 100644 index 00000000000..e7af8acb9d2 --- /dev/null +++ b/nexus/src/app/sagas/instance_update/migration.rs @@ -0,0 +1,100 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::{ActionRegistry, NexusActionContext, NexusSaga}; +use crate::app::db::model::Instance; +use crate::app::db::model::Migration; +use crate::app::sagas::declare_saga_actions; +use nexus_db_queries::{authn, authz}; +use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; +use serde::{Deserialize, Serialize}; +use steno::ActionError; + +// instance update (migration) subsaga: actions + +// This subsaga is responsible for handling an instance update where either side +// of an active migration reports that the migration has failed or completed. +declare_saga_actions! { + instance_update_migration; + + // Clear migration IDs and write back the instance + CLEAR_MIGRATION_IDS -> "clear_migration_ids" { + + sium_clear_migration_ids + } +} + +/// Parameters to the instance update (migration) sub-saga. +#[derive(Debug, Deserialize, Serialize)] +pub(super) struct Params { + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub(super) serialized_authn: authn::saga::Serialized, + + pub(super) authz_instance: authz::Instance, + + pub(super) instance: Instance, + + pub(super) migration: Migration, +} + +#[derive(Debug)] +pub(super) struct SagaMigrationUpdate; +impl NexusSaga for SagaMigrationUpdate { + const NAME: &'static str = "instance-update-migration"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + instance_update_migration_register_actions(registry); + } + + fn make_saga_dag( + params: &Self::Params, + mut builder: steno::DagBuilder, + ) -> Result { + todo!("eliza: draw the rest of the saga...") + } +} + +async fn sium_clear_migration_ids( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let Params { + ref serialized_authn, ref authz_instance, ref migration, .. 
+ } = sagactx.saga_params()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + let src_propolis_id = + PropolisUuid::from_untyped_uuid(migration.source_propolis_id); + let target_propolis_id = + PropolisUuid::from_untyped_uuid(migration.target_propolis_id); + + slog::info!( + osagactx.log(), + "instance update (migration): clearing migration IDs"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %src_propolis_id, + "target_propolis_id" => %target_propolis_id, + "migration_failed" => migration.either_side_failed(), + ); + + osagactx + .datastore() + .instance_unset_migration_ids( + &opctx, + instance_id, + migration.id, + target_propolis_id, + ) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index eaac701c9ed..ca446373463 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -18,6 +18,7 @@ use steno::{ActionError, DagBuilder, Node, SagaName}; use uuid::Uuid; mod destroyed; +mod migration; // The public interface to this saga is actually a smaller saga that starts the // "real" update saga, which inherits the lock from the start saga. This is @@ -129,6 +130,52 @@ impl NexusSaga for SagaDoActualInstanceUpdate { } } + // Next, determine what to do with the migration. A migration update + // saga needs to be scheduled if (and only if) the instance's migration + // ID currently points to a migration. The `instance_fetch_all` query + // will only return a migration if it is the instance's currently active + // migration, so if we have one here, that means that there's a + // migration. + if let Some(migration) = params.state.migration.clone() { + if migration.either_side_failed() + || migration.either_side_completed() + { + const MIGRATION_SUBSAGA_PARAMS: &str = + "params_for_migration_subsaga"; + let subsaga_params = migration::Params { + serialized_authn: params.serialized_authn.clone(), + authz_instance: params.authz_instance.clone(), + instance: params.state.instance.clone(), + migration, + }; + let subsaga_dag = { + let subsaga_builder = DagBuilder::new(SagaName::new( + migration::SagaMigrationUpdate::NAME, + )); + migration::SagaMigrationUpdate::make_saga_dag( + &subsaga_params, + subsaga_builder, + )? + }; + + builder.append(Node::constant( + MIGRATION_SUBSAGA_PARAMS, + serde_json::to_value(&subsaga_params).map_err(|e| { + SagaInitError::SerializeError( + MIGRATION_SUBSAGA_PARAMS.to_string(), + e, + ) + })?, + )); + + builder.append(Node::subsaga( + "migration_subsaga_no_result", + subsaga_dag, + MIGRATION_SUBSAGA_PARAMS, + )); + } + } + builder.append(unlock_instance_action()); Ok(builder.build()?) 
} From 0cfec53d6b79ca6a60ed5a92c4369ea137aa50fb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 12:16:43 -0700 Subject: [PATCH 060/234] fix migrate saga sending states without migration IDs --- nexus/db-queries/src/db/datastore/instance.rs | 16 ++++++++++++---- nexus/src/app/sagas/instance_migrate.rs | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 37cd08605e7..085e68c1224 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -519,6 +519,14 @@ impl DataStore { } /// Updates an instance record by setting the instance's migration ID. + // + // TODO-design It's tempting to return the updated state of the Instance + // here because it's convenient for consumers and by using a RETURNING + // clause, we could ensure that the "update" and "fetch" are atomic. + // But in the unusual case that we _don't_ update the row because our + // update is older than the one in the database, we would have to fetch + // the current state explicitly. For now, we'll just require consumers + // to explicitly fetch the state if they want that. pub async fn instance_set_migration_ids( &self, opctx: &OpContext, @@ -526,7 +534,7 @@ impl DataStore { src_propolis_id: PropolisUuid, migration_id: Uuid, target_propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { use db::schema::instance::dsl; let instance_id = instance_id.into_untyped_uuid(); @@ -560,8 +568,8 @@ impl DataStore { match updated { // If we updated the instance, that's great! Good job team! - UpdateAndQueryResult { status: UpdateStatus::Updated, found } => { - Ok(found) + UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { + Ok(true) } // No update was performed because the migration ID has already been // set to the ID we were trying to set it to. That's fine, count it @@ -577,7 +585,7 @@ impl DataStore { found.runtime_state.propolis_id, Some(src_propolis_id) ); - Ok(found) + Ok(false) } // On the other hand, if there was already a different migration ID, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index b495b24ccd9..7c51730b252 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -365,6 +365,24 @@ async fn sim_set_migration_ids( dst_propolis_id, ) .await + .map_err(ActionError::action_failed)?; + + // Refetch the instance to make sure we have the correct thing to send to + // sled-agents. + // TODO(eliza): we *could* probably just munge the previous + // `InstanceRuntimeState` to have the migration IDs set, but...that feels + // sketchy. Doing another db query here to get the latest state is kinda sad + // but whatever. 
+ let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(db_instance.id()) + .lookup_for(authz::Action::Read) + .await + .map_err(ActionError::action_failed)?; + + osagactx + .datastore() + .instance_refetch(&opctx, &authz_instance) + .await .map_err(ActionError::action_failed) } From 74b9e37d0eb8fa4994f0e8deb3bc555440326924 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 13:25:03 -0700 Subject: [PATCH 061/234] more sketching out migration update saga --- nexus/db-model/src/migration.rs | 12 ++--- nexus/db-model/src/migration_state.rs | 5 ++ .../app/sagas/instance_update/migration.rs | 52 ++++++++++++++++++- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index eceb7e15b3a..56b25fda84e 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -90,19 +90,15 @@ impl Migration { } } - pub const COMPLETED: MigrationState = - MigrationState(nexus::MigrationState::Completed); - pub const FAILED: MigrationState = - MigrationState(nexus::MigrationState::Failed); - /// Returns `true` if either side of the migration has failed. pub fn either_side_failed(&self) -> bool { - self.source_state == Self::FAILED || self.target_state == Self::FAILED + self.source_state == MigrationState::FAILED + || self.target_state == MigrationState::FAILED } /// Returns `true` if either side of the migration has completed. pub fn either_side_completed(&self) -> bool { - self.source_state == Self::COMPLETED - || self.target_state == Self::COMPLETED + self.source_state == MigrationState::COMPLETED + || self.target_state == MigrationState::COMPLETED } } diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index 694198eb56c..380039076b5 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -28,6 +28,11 @@ impl_enum_wrapper!( ); impl MigrationState { + pub const COMPLETED: MigrationState = + MigrationState(nexus::MigrationState::Completed); + pub const FAILED: MigrationState = + MigrationState(nexus::MigrationState::Failed); + /// Returns `true` if this migration state means that the migration is no /// longer in progress (it has either succeeded or failed). #[must_use] diff --git a/nexus/src/app/sagas/instance_update/migration.rs b/nexus/src/app/sagas/instance_update/migration.rs index e7af8acb9d2..dfeb26099be 100644 --- a/nexus/src/app/sagas/instance_update/migration.rs +++ b/nexus/src/app/sagas/instance_update/migration.rs @@ -5,6 +5,7 @@ use super::{ActionRegistry, NexusActionContext, NexusSaga}; use crate::app::db::model::Instance; use crate::app::db::model::Migration; +use crate::app::db::model::MigrationState; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::{authn, authz}; use omicron_uuid_kinds::GenericUuid; @@ -23,6 +24,18 @@ declare_saga_actions! { // Clear migration IDs and write back the instance CLEAR_MIGRATION_IDS -> "clear_migration_ids" { + sium_clear_migration_ids + - sium_unclear_migration_ids + } + + // Set the target VMM to the active VMM. + SET_NEW_ACTIVE_VMM -> "set_new_active_vmm" { + + sium_set_new_active_vmm + - sium_unset_new_active_vmm + } + + // Update network configuration to point to the new active VMM. 
+ UPDATE_NETWORK_CONFIG -> "update_network_config" { + + sium_update_network_config } } @@ -54,7 +67,20 @@ impl NexusSaga for SagaMigrationUpdate { params: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { - todo!("eliza: draw the rest of the saga...") + if params.migration.either_side_failed() { + builder.append(clear_migration_ids_action()); + return Ok(builder.build()?); + } + + if params.migration.either_side_completed() { + builder.append(set_new_active_vmm_action()); + builder.append(update_network_config_action()); + if params.migration.target_state == MigrationState::COMPLETED { + builder.append(clear_migration_ids_action()); + } + } + + Ok(builder.build()?) } } @@ -98,3 +124,27 @@ async fn sium_clear_migration_ids( Ok(()) } + +async fn sium_unclear_migration_ids( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + todo!("eliza") +} + +async fn sium_set_new_active_vmm( + sagactx: NexusActionContext, +) -> Result { + todo!("eliza") +} + +async fn sium_unset_new_active_vmm( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + todo!("eliza") +} + +async fn sium_update_network_config( + sagactx: NexusActionContext, +) -> Result { + todo!("eliza") +} From ffcc23f79de52b54a8a7692b805ceb23b564bb57 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 15:02:05 -0700 Subject: [PATCH 062/234] okay, i think this is the migration part --- nexus/db-model/src/migration.rs | 6 + nexus/db-model/src/migration_state.rs | 4 + nexus/src/app/instance_network.rs | 170 --------------- .../app/sagas/instance_update/migration.rs | 199 ++++++++++++------ nexus/src/app/sagas/instance_update/mod.rs | 10 +- 5 files changed, 148 insertions(+), 241 deletions(-) diff --git a/nexus/db-model/src/migration.rs b/nexus/db-model/src/migration.rs index 56b25fda84e..d7c18ae5dd5 100644 --- a/nexus/db-model/src/migration.rs +++ b/nexus/db-model/src/migration.rs @@ -90,6 +90,12 @@ impl Migration { } } + /// Returns `true` if either side reports that the migration is in a + /// terminal state. + pub fn is_terminal(&self) -> bool { + self.source_state.is_terminal() || self.target_state.is_terminal() + } + /// Returns `true` if either side of the migration has failed. pub fn either_side_failed(&self) -> bool { self.source_state == MigrationState::FAILED diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index 380039076b5..c06bbd67ea8 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -32,6 +32,10 @@ impl MigrationState { MigrationState(nexus::MigrationState::Completed); pub const FAILED: MigrationState = MigrationState(nexus::MigrationState::Failed); + pub const PENDING: MigrationState = + MigrationState(nexus::MigrationState::Pending); + pub const IN_PROGRESS: MigrationState = + MigrationState(nexus::MigrationState::InProgress); /// Returns `true` if this migration state means that the migration is no /// longer in progress (it has either succeeded or failed). diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 3ac0757b47e..27edcacabd7 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -228,176 +228,6 @@ pub(crate) async fn boundary_switches( Ok(boundary_switches) } -// /// Given old and new instance runtime states, determines the desired -// /// networking configuration for a given instance and ensures it has been -// /// propagated to all relevant sleds. 
-// /// -// /// # Arguments -// /// -// /// - `datastore`: the datastore to use for lookups and updates. -// /// - `log`: the [`slog::Logger`] to log to. -// /// - `resolver`: an internal DNS resolver to look up DPD service addresses. -// /// - `opctx`: An operation context for this operation. -// /// - `opctx_alloc`: An operational context list permissions for all sleds. When -// /// called by methods on the [`Nexus`] type, this is the `OpContext` used for -// /// instance allocation. In a background task, this may be the background -// /// task's operational context; nothing stops you from passing the same -// /// `OpContext` as both `opctx` and `opctx_alloc`. -// /// - `authz_instance``: A resolved authorization context for the instance of -// /// interest. -// /// - `prev_instance_state``: The most-recently-recorded instance runtime -// /// state for this instance. -// /// - `new_instance_state`: The instance state that the caller of this routine -// /// has observed and that should be used to set up this instance's -// /// networking state. -// /// -// /// # Return value -// /// -// /// `Ok(())` if this routine completed all the operations it wanted to -// /// complete, or an appropriate `Err` otherwise. -// #[allow(clippy::too_many_arguments)] // Yeah, I know, I know, Clippy... -// #[allow(dead_code)] // TODO(eliza): this probably needs to be deleted eventually -// pub(crate) async fn ensure_updated_instance_network_config( -// datastore: &DataStore, -// log: &slog::Logger, -// resolver: &internal_dns::resolver::Resolver, -// opctx: &OpContext, -// opctx_alloc: &OpContext, -// authz_instance: &authz::Instance, -// prev_instance_state: &db::model::InstanceRuntimeState, -// new_instance_state: &nexus::InstanceRuntimeState, -// v2p_manager: &background::Activator, -// ) -> Result<(), Error> { -// let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - -// // If this instance update is stale, do nothing, since the superseding -// // update may have allowed the instance's location to change further. -// if prev_instance_state.gen >= new_instance_state.gen.into() { -// debug!(log, -// "instance state generation already advanced, \ -// won't touch network config"; -// "instance_id" => %instance_id); - -// return Ok(()); -// } - -// // If this update will retire the instance's active VMM, delete its -// // networking state. It will be re-established the next time the -// // instance starts. -// if new_instance_state.propolis_id.is_none() { -// info!(log, -// "instance cleared its Propolis ID, cleaning network config"; -// "instance_id" => %instance_id, -// "propolis_id" => ?prev_instance_state.propolis_id); - -// clear_instance_networking_state( -// datastore, -// log, -// resolver, -// opctx, -// opctx_alloc, -// authz_instance, -// v2p_manager, -// ) -// .await?; -// return Ok(()); -// } - -// // If the instance still has a migration in progress, don't change -// // any networking state until an update arrives that retires that -// // migration. -// // -// // This is needed to avoid the following race: -// // -// // 1. Migration from S to T completes. -// // 2. Migration source sends an update that changes the instance's -// // active VMM but leaves the migration ID in place. -// // 3. Meanwhile, migration target sends an update that changes the -// // instance's active VMM and clears the migration ID. -// // 4. The migration target's call updates networking state and commits -// // the new instance record. -// // 5. 
The instance migrates from T to T' and Nexus applies networking -// // configuration reflecting that the instance is on T'. -// // 6. The update in step 2 applies configuration saying the instance -// // is on sled T. -// if new_instance_state.migration_id.is_some() { -// debug!(log, -// "instance still has a migration in progress, won't touch \ -// network config"; -// "instance_id" => %instance_id, -// "migration_id" => ?new_instance_state.migration_id); - -// return Ok(()); -// } - -// let new_propolis_id = new_instance_state.propolis_id.unwrap(); - -// // Updates that end live migration need to push OPTE V2P state even if -// // the instance's active sled did not change (see below). -// let migration_retired = prev_instance_state.migration_id.is_some() -// && new_instance_state.migration_id.is_none(); - -// if (prev_instance_state.propolis_id -// == new_instance_state.propolis_id.map(GenericUuid::into_untyped_uuid)) -// && !migration_retired -// { -// debug!(log, "instance didn't move, won't touch network config"; -// "instance_id" => %instance_id); - -// return Ok(()); -// } - -// // Either the instance moved from one sled to another, or it attempted -// // to migrate and failed. Ensure the correct networking configuration -// // exists for its current home. -// // -// // TODO(#3107) This is necessary even if the instance didn't move, -// // because registering a migration target on a sled creates OPTE ports -// // for its VNICs, and that creates new V2P mappings on that sled that -// // place the relevant virtual IPs on the local sled. Once OPTE stops -// // creating these mappings, this path only needs to be taken if an -// // instance has changed sleds. -// let new_sled_id = match datastore -// .vmm_fetch(&opctx, authz_instance, &new_propolis_id) -// .await -// { -// Ok(vmm) => vmm.sled_id, - -// // A VMM in the active position should never be destroyed. If the -// // sled sending this message is the owner of the instance's last -// // active VMM and is destroying it, it should also have retired that -// // VMM. -// Err(Error::ObjectNotFound { .. }) => { -// error!(log, "instance's active vmm unexpectedly not found"; -// "instance_id" => %instance_id, -// "propolis_id" => %new_propolis_id); - -// return Ok(()); -// } - -// Err(e) => return Err(e), -// }; - -// v2p_manager.activate(); - -// let (.., sled) = -// LookupPath::new(opctx, datastore).sled_id(new_sled_id).fetch().await?; - -// instance_ensure_dpd_config( -// datastore, -// log, -// resolver, -// opctx, -// opctx_alloc, -// instance_id, -// &sled.address(), -// None, -// ) -// .await?; - -// Ok(()) -// } - /// Ensures that the Dendrite configuration for the supplied instance is /// up-to-date. /// diff --git a/nexus/src/app/sagas/instance_update/migration.rs b/nexus/src/app/sagas/instance_update/migration.rs index dfeb26099be..bae8ce7590f 100644 --- a/nexus/src/app/sagas/instance_update/migration.rs +++ b/nexus/src/app/sagas/instance_update/migration.rs @@ -3,11 +3,14 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use super::{ActionRegistry, NexusActionContext, NexusSaga}; +use crate::app::db::model::Generation; use crate::app::db::model::Instance; use crate::app::db::model::Migration; use crate::app::db::model::MigrationState; use crate::app::sagas::declare_saga_actions; +use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::{authn, authz}; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -21,16 +24,13 @@ use steno::ActionError; declare_saga_actions! { instance_update_migration; - // Clear migration IDs and write back the instance - CLEAR_MIGRATION_IDS -> "clear_migration_ids" { - + sium_clear_migration_ids - - sium_unclear_migration_ids - } - - // Set the target VMM to the active VMM. - SET_NEW_ACTIVE_VMM -> "set_new_active_vmm" { - + sium_set_new_active_vmm - - sium_unset_new_active_vmm + // Update the instance record to reflect the migration event. If the + // migration has completed on the target VMM, or if the migration has + // failed, this will clear the migration IDs, allowing the instance to + // migrate again. If the migration has completed on either VMM, the target + // VMM becomes the active VMM. + UPDATE_INSTANCE_RECORD -> "update_instance_record" { + + sium_update_instance_record } // Update network configuration to point to the new active VMM. @@ -64,87 +64,154 @@ impl NexusSaga for SagaMigrationUpdate { } fn make_saga_dag( - params: &Self::Params, + _: &Self::Params, mut builder: steno::DagBuilder, ) -> Result { - if params.migration.either_side_failed() { - builder.append(clear_migration_ids_action()); - return Ok(builder.build()?); - } - - if params.migration.either_side_completed() { - builder.append(set_new_active_vmm_action()); - builder.append(update_network_config_action()); - if params.migration.target_state == MigrationState::COMPLETED { - builder.append(clear_migration_ids_action()); - } - } + builder.append(update_instance_record_action()); + builder.append(update_network_config_action()); Ok(builder.build()?) } } -async fn sium_clear_migration_ids( +async fn sium_update_instance_record( + sagactx: NexusActionContext, +) -> Result { + let Params { ref authz_instance, ref migration, ref instance, .. } = + sagactx.saga_params()?; + + let osagactx = sagactx.user_data(); + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + let mut new_runtime = instance.runtime().clone(); + new_runtime.gen = Generation(new_runtime.gen.next()); + + // Determine how to update the instance record to reflect the current + // migration state. + let failed = migration.either_side_failed(); + // If the migration has failed, or if the target reports that the migration + // has completed, clear the instance record's migration IDs so that a new + // migration can begin. + if failed || migration.target_state == MigrationState::COMPLETED { + info!( + osagactx.log(), + "instance update (migration {}): clearing migration IDs", + if failed { "failed" } else { "target_completed" }; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + "instance_update" => %"migration", + ); + new_runtime.migration_id = None; + new_runtime.dst_propolis_id = None; + } + + // If either side reports that the migration has completed, move the target + // Propolis ID to the active position. 
+ let new_propolis_id = if !failed && migration.either_side_completed() { + info!( + osagactx.log(), + "instance update (migration completed): setting active VMM ID to target"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + "instance_update" => %"migration", + ); + new_runtime.propolis_id = Some(migration.target_propolis_id); + migration.target_propolis_id + } else { + migration.source_propolis_id + }; + + osagactx + .datastore() + .instance_update_runtime(&instance_id, &new_runtime) + .await + .map_err(ActionError::action_failed)?; + + Ok(PropolisUuid::from_untyped_uuid(new_propolis_id)) +} + +async fn sium_update_network_config( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Params { - ref serialized_authn, ref authz_instance, ref migration, .. - } = sagactx.saga_params()?; + let Params { ref serialized_authn, ref authz_instance, migration, .. } = + sagactx.saga_params()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let osagactx = sagactx.user_data(); let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - let src_propolis_id = - PropolisUuid::from_untyped_uuid(migration.source_propolis_id); - let target_propolis_id = - PropolisUuid::from_untyped_uuid(migration.target_propolis_id); + // Either the instance moved from one sled to another, or it attempted + // to migrate and failed. Ensure the correct networking configuration + // exists for its current home. + // + // TODO(#3107) This is necessary even if the instance didn't move, + // because registering a migration target on a sled creates OPTE ports + // for its VNICs, and that creates new V2P mappings on that sled that + // place the relevant virtual IPs on the local sled. Once OPTE stops + // creating these mappings, this path only needs to be taken if an + // instance has changed sleds. + + // Look up the ID of the sled that the instance now resides on, so that we + // can look up its address. + let active_propolis_id = + sagactx.lookup::("update_instance_record")?; + let new_sled_id = match osagactx + .datastore() + .vmm_fetch(&opctx, authz_instance, &active_propolis_id) + .await + { + Ok(vmm) => vmm.sled_id, + + // A VMM in the active position should never be destroyed. If the + // sled sending this message is the owner of the instance's last + // active VMM and is destroying it, it should also have retired that + // VMM. + Err(Error::ObjectNotFound { .. 
}) => { + error!(osagactx.log(), "instance's active vmm unexpectedly not found"; + "instance_id" => %instance_id, + "propolis_id" => %active_propolis_id); + + return Ok(()); + } + Err(e) => return Err(ActionError::action_failed(e)), + }; - slog::info!( + info!( osagactx.log(), - "instance update (migration): clearing migration IDs"; + "instance update (migration): ensuring updated instance network config"; "instance_id" => %instance_id, "migration_id" => %migration.id, - "src_propolis_id" => %src_propolis_id, - "target_propolis_id" => %target_propolis_id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + "active_propolis_id" => %active_propolis_id, + "sled_id" => %new_sled_id, "migration_failed" => migration.either_side_failed(), ); - osagactx - .datastore() - .instance_unset_migration_ids( - &opctx, - instance_id, - migration.id, - target_propolis_id, + if let Err(e) = osagactx.nexus().v2p_notification_tx.send(()) { + error!( + osagactx.log(), + "error notifying background task of v2p change"; + "error" => ?e ) + }; + + let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) + .sled_id(new_sled_id) + .fetch() .await .map_err(ActionError::action_failed)?; - Ok(()) -} - -async fn sium_unclear_migration_ids( - sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - todo!("eliza") -} - -async fn sium_set_new_active_vmm( - sagactx: NexusActionContext, -) -> Result { - todo!("eliza") -} - -async fn sium_unset_new_active_vmm( - sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - todo!("eliza") -} + osagactx + .nexus() + .instance_ensure_dpd_config(&opctx, instance_id, &sled.address(), None) + .await + .map_err(ActionError::action_failed)?; -async fn sium_update_network_config( - sagactx: NexusActionContext, -) -> Result { - todo!("eliza") + Ok(()) } diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index ca446373463..89c6f78c209 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -136,17 +136,17 @@ impl NexusSaga for SagaDoActualInstanceUpdate { // will only return a migration if it is the instance's currently active // migration, so if we have one here, that means that there's a // migration. - if let Some(migration) = params.state.migration.clone() { - if migration.either_side_failed() - || migration.either_side_completed() - { + if let Some(ref migration) = params.state.migration { + // If either side of the migration reports a terminal state, update + // the instance to reflect that. 
+ if migration.is_terminal() { const MIGRATION_SUBSAGA_PARAMS: &str = "params_for_migration_subsaga"; let subsaga_params = migration::Params { serialized_authn: params.serialized_authn.clone(), authz_instance: params.authz_instance.clone(), instance: params.state.instance.clone(), - migration, + migration: migration.clone(), }; let subsaga_dag = { let subsaga_builder = DagBuilder::new(SagaName::new( From 932a17816d80da692e2266491a69d1b08cae2a10 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 15:39:51 -0700 Subject: [PATCH 063/234] ...you have to actually register the saga actions --- nexus/src/app/sagas/instance_update/start.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index f6ccb1053fe..8c4d79cf19f 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -62,6 +62,7 @@ impl NexusSaga for SagaInstanceUpdate { start_instance_update_register_actions(registry); super::SagaDoActualInstanceUpdate::register_actions(registry); super::destroyed::SagaVmmDestroyed::register_actions(registry); + super::migration::SagaMigrationUpdate::register_actions(registry); } fn make_saga_dag( From 1bda35653deb681257864000cf5acbec54ae40a4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 16:10:35 -0700 Subject: [PATCH 064/234] obnoxious clippy nonsense --- nexus/src/app/sagas/instance_update/destroyed.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 33589eab5fb..263acf26ded 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -254,7 +254,7 @@ async fn siud_update_instance( let osagactx = sagactx.user_data(); let new_runtime = InstanceRuntimeState { propolis_id: None, - nexus_state: InstanceState::NoVmm.into(), + nexus_state: InstanceState::NoVmm, gen: Generation(instance.runtime_state.gen.0.next()), ..instance.runtime_state }; From 9450f314610ea95adb533a68ec36d3f83ca55ad8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 16:15:58 -0700 Subject: [PATCH 065/234] shut up clippy in a slightly more polite way --- sled-agent/src/instance.rs | 15 +++++++++------ sled-agent/src/instance_manager.rs | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 3b17b68afcb..014993443ad 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -955,6 +955,9 @@ pub(crate) struct InstanceInitialState { pub hardware: InstanceHardware, pub vmm_runtime: VmmRuntimeState, pub propolis_addr: SocketAddr, + /// UUID of the migration in to this VMM, if the VMM is being created as the + /// target of an active migration. + pub migration_id: Option, } impl Instance { @@ -965,20 +968,16 @@ impl Instance { /// * `log`: Logger for dumping debug information. /// * `id`: UUID of the instance to be created. /// * `propolis_id`: UUID for the VMM to be created. - /// * `migration_id`: UUID of the migration in to this VMM, if the VMM is - /// being created as the target of an active migration. /// * `ticket`: A ticket that ensures this instance is a member of its /// instance manager's tracking table. /// * `state`: The initial state of this instance. /// * `services`: A set of instance manager-provided services. 
/// * `sled_identifiers`: Sled-related metadata used to track statistics. /// * `metadata`: Instance-related metadata used to track statistics. - #[allow(clippy::too_many_arguments)] pub(crate) fn new( log: Logger, id: InstanceUuid, propolis_id: PropolisUuid, - migration_id: Option, ticket: InstanceTicket, state: InstanceInitialState, services: InstanceManagerServices, @@ -988,11 +987,15 @@ impl Instance { info!(log, "initializing new Instance"; "instance_id" => %id, "propolis_id" => %propolis_id, - "migration_id" => ?migration_id, + "migration_id" => ?state.migration_id, "state" => ?state); let InstanceInitialState { - hardware, vmm_runtime, propolis_addr, .. + hardware, + vmm_runtime, + propolis_addr, + migration_id, + .. } = state; let InstanceManagerServices { diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 2cbaf32dd32..0de890ce63e 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -621,13 +621,13 @@ impl InstanceManagerRunner { hardware, vmm_runtime, propolis_addr, + migration_id: instance_runtime.migration_id, }; let instance = Instance::new( instance_log, instance_id, propolis_id, - instance_runtime.migration_id, ticket, state, services, From 437cc7ae9cd40f7a9144a3c25b87ee6ea95137dd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 21 Jun 2024 16:19:17 -0700 Subject: [PATCH 066/234] fix docs --- nexus/db-queries/src/db/datastore/instance.rs | 8 ++++---- nexus/db-queries/src/db/datastore/migration.rs | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 085e68c1224..950e5f3118b 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -177,14 +177,14 @@ pub struct InstanceUpdateResult { pub instance_updated: bool, /// `true` if the VMM record was updated, `false` otherwise. pub vmm_updated: bool, - /// Indicates whether a migration record for this instance was updated, if a - /// [`MigrationRuntimeState`] was provided to + /// Indicates whether a migration record for this instance was updated, if + /// [`Migrations`] were provided to /// [`DataStore::instance_and_vmm_update_runtime`]. /// /// - `Some(true)` if a migration record was updated - /// - `Some(false)` if a [`MigrationRuntimeState`] was provided, but the + /// - `Some(false)` if [`Migrations`] were provided, but the /// migration record was not updated - /// - `None` if no [`MigrationRuntimeState`] was provided + /// - `None` if no [`Migrations`] were provided pub migration_updated: Option, } diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 049f0b0f6ff..8a7d1c645bd 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -105,10 +105,6 @@ impl DataStore { } /// Unconditionally mark a migration record as deleted. - /// - /// This is distinct from [`DataStore::migration_terminate`], as it will - /// mark a migration as deleted regardless of the states of the source and - /// target VMMs. 
pub async fn migration_mark_deleted( &self, opctx: &OpContext, From b2cc1a8c18c285b45535a185ddcb7b1d1b2aa7c0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 24 Jun 2024 14:53:04 -0700 Subject: [PATCH 067/234] update omdb output --- dev-tools/omdb/tests/env.out | 12 ++++++++++++ dev-tools/omdb/tests/successes.out | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index a6bf4d46670..67f113a801e 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -86,6 +86,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states @@ -231,6 +235,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states @@ -363,6 +371,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index cec3fa30529..eee272433df 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -287,6 +287,10 @@ task: "external_endpoints" on each one +task: "instance_updater" + detects if instances require update sagas and schedules them + + task: "instance_watcher" periodically checks instance states @@ -482,6 +486,13 @@ task: "external_endpoints" TLS certificates: 0 +task: "instance_updater" + configured period: every 30s + currently executing: no + last completed activation: , triggered by an explicit signal + started at (s ago) and ran for ms +warning: unknown background task: "instance_updater" (don't know how to interpret details: Object {"destroyed_active_vmms": Number(0), "error": Null, "sagas_started": Number(0)}) + task: "instance_watcher" configured period: every s currently executing: no @@ -490,6 +501,7 @@ task: "instance_watcher" total instances checked: 0 checks completed: 0 successful checks: 0 + update sagas queued: 0 failed checks: 0 checks that could not be completed: 0 stale instance metrics pruned: 0 From 0e38a5a654b37ed2fb78d210a1fbfee966be782f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 24 Jun 2024 14:57:23 -0700 Subject: [PATCH 068/234] fix(?) illumos-only tests --- sled-agent/src/instance.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 014993443ad..e29b8458520 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -995,7 +995,6 @@ impl Instance { vmm_runtime, propolis_addr, migration_id, - .. 
} = state; let InstanceManagerServices { @@ -1822,19 +1821,13 @@ mod tests { InstanceInitialState { hardware, - instance_runtime: InstanceRuntimeState { - propolis_id: Some(propolis_id), - dst_propolis_id: None, - migration_id: None, - gen: Generation::new(), - time_updated: Default::default(), - }, vmm_runtime: VmmRuntimeState { state: VmmState::Starting, gen: Generation::new(), time_updated: Default::default(), }, propolis_addr, + migration_id: None, } } @@ -2219,9 +2212,9 @@ mod tests { let propolis_id = PropolisUuid::from_untyped_uuid(PROPOLIS_ID); let InstanceInitialState { hardware, - instance_runtime, vmm_runtime, propolis_addr, + migration_id, } = fake_instance_initial_state(propolis_id, propolis_addr); let metadata = InstanceMetadata { @@ -2240,7 +2233,6 @@ mod tests { instance_id, propolis_id, hardware, - instance_runtime, vmm_runtime, propolis_addr, sled_identifiers, From 15b8333458f786e4dfcf7991f2f4efacd349eb50 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 24 Jun 2024 15:03:33 -0700 Subject: [PATCH 069/234] put that back --- sled-agent/src/instance.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index e29b8458520..10b69c72466 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -2229,10 +2229,19 @@ mod tests { serial: "fake-serial".into(), }; + let instance_runtime = InstanceRuntimeState { + propolis_id: Some(propolis_id), + dst_propolis_id: None, + migration_id: None, + gen: Generation::new(), + time_updated: Default::default(), + }; + mgr.ensure_registered( instance_id, propolis_id, hardware, + instance_runtime, vmm_runtime, propolis_addr, sled_identifiers, From 3170f41c801b7bb69b7c5f526cb6d63b4264f66c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 24 Jun 2024 15:43:20 -0700 Subject: [PATCH 070/234] whoops, imports --- sled-agent/src/instance.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 10b69c72466..62c7c6ca927 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1572,7 +1572,9 @@ mod tests { use omicron_common::api::external::{ ByteCount, Generation, Hostname, InstanceCpuCount, }; - use omicron_common::api::internal::nexus::{InstanceProperties, VmmState}; + use omicron_common::api::internal::nexus::{ + InstanceProperties, InstanceRuntimeState, VmmState, + }; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_common::FileKv; use sled_storage::manager_test_harness::StorageManagerTestHarness; From 85db24b26ced5262a753f19e863945faceec261d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 24 Jun 2024 16:10:17 -0700 Subject: [PATCH 071/234] docs build unbreakening --- nexus/src/app/instance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 6542bd962cd..0bf7dd6c815 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1861,7 +1861,7 @@ impl super::Nexus { } } -/// [`Nexus::notify_instance_updated`] (~~Taylor~~ background task's version) +/// `Nexus::notify_instance_updated` (~~Taylor~~ background task's version) pub(crate) async fn notify_instance_updated_background( datastore: &DataStore, opctx: &OpContext, From 5091226f4018cd0c6eafaedeb3e552979edab171 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 24 Jun 2024 16:24:43 -0700 Subject: [PATCH 072/234] you have to actually update the timestamps --- 
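A minimal sketch of the pattern this patch enforces (assuming the `instance`, `datastore`, and `instance_id` bindings used elsewhere in these sagas — the exact receivers differ per call site): every rewrite of a runtime-state record advances the generation and refreshes `time_updated` together, rather than bumping the generation alone.

    let mut new_runtime = instance.runtime().clone();
    new_runtime.gen = Generation(new_runtime.gen.next());
    new_runtime.time_updated = Utc::now();
    datastore.instance_update_runtime(&instance_id, &new_runtime).await?;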
nexus/src/app/sagas/instance_update/destroyed.rs | 2 ++ nexus/src/app/sagas/instance_update/migration.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 263acf26ded..eaf45f1dba0 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -7,6 +7,7 @@ use super::NexusActionContext; use super::NexusSaga; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::ActionError; +use chrono::Utc; use nexus_db_model::Generation; use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; @@ -256,6 +257,7 @@ async fn siud_update_instance( propolis_id: None, nexus_state: InstanceState::NoVmm, gen: Generation(instance.runtime_state.gen.0.next()), + time_updated: Utc::now(), ..instance.runtime_state }; diff --git a/nexus/src/app/sagas/instance_update/migration.rs b/nexus/src/app/sagas/instance_update/migration.rs index bae8ce7590f..c382fbf30b7 100644 --- a/nexus/src/app/sagas/instance_update/migration.rs +++ b/nexus/src/app/sagas/instance_update/migration.rs @@ -8,6 +8,7 @@ use crate::app::db::model::Instance; use crate::app::db::model::Migration; use crate::app::db::model::MigrationState; use crate::app::sagas::declare_saga_actions; +use chrono::Utc; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::{authn, authz}; use omicron_common::api::external::Error; @@ -85,6 +86,7 @@ async fn sium_update_instance_record( let mut new_runtime = instance.runtime().clone(); new_runtime.gen = Generation(new_runtime.gen.next()); + new_runtime.time_updated = Utc::now(); // Determine how to update the instance record to reflect the current // migration state. From 3930d0621bb9256954d2172292c06fe4b112950f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 25 Jun 2024 10:36:32 -0700 Subject: [PATCH 073/234] fix virtual provisioning record not being deleted --- .../app/sagas/instance_update/destroyed.rs | 71 ++++++++++++++----- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index eaf45f1dba0..75c9cc38927 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -152,15 +152,23 @@ async fn siud_release_virtual_provisioning( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - info!( - osagactx.log(), - "instance update (VMM destroyed): deallocating virtual provisioning resources"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm_id, - "instance_update" => %"VMM destroyed", - ); - - osagactx + // `virtual_provisioning_collection_delete_instace` will only delete virtual + // provisioning records that are *less than* the max generation parameter, + // not less than or equal to it --- the idea is that the generation number + // has already been advanced when we are deallocating the virtual + // provisioning records. This is kind of an artifact of sled-agent + // previously owning instance runtime state generations, since the + // sled-agent would have already advanced the instance's generation. 
+ // + // However, now that the instance record is owned by Nexus, and we are + // updating the instance in response to a VMM state update from sled-agent, + // the instance record snapshot we are holding has not yet had its + // generation advanced, so we want to allow deleting virtual provisioning + // records that were created with the instance's current generation. The + // generation will be advanced at the end of this saga, once we have updated + // the actual instance record. + let max_gen = instance.runtime_state.gen.next(); + let result = osagactx .datastore() .virtual_provisioning_collection_delete_instance( &opctx, @@ -168,17 +176,42 @@ async fn siud_release_virtual_provisioning( instance.project_id, i64::from(instance.ncpus.0 .0), instance.memory, - i64::try_from(&instance.runtime_state.gen.0).unwrap(), + i64::try_from(&max_gen).unwrap(), ) - .await - .map(|_| ()) - .or_else(|err| { - // Necessary for idempotency - match err { - Error::ObjectNotFound { .. } => Ok(()), - _ => Err(ActionError::action_failed(err)), - } - }) + .await; + match result { + Ok(deleted) => { + info!( + osagactx.log(), + "instance update (VMM destroyed): deallocated virtual \ + provisioning resources"; + "instance_id" => %authz_instance.id(), + "propolis_id" => %vmm_id, + "records_deleted" => ?deleted, + "instance_update" => %"VMM destroyed", + ); + } + // Necessary for idempotency --- the virtual provisioning resources may + // have been deleted already, that's fine. + Err(Error::ObjectNotFound { .. }) => { + // TODO(eliza): it would be nice if we could distinguish + // between errors returned by + // `virtual_provisioning_collection_delete_instance` where + // the instance ID was not found, and errors where the + // generation number was too low... + info!( + osagactx.log(), + "instance update (VMM destroyed): virtual provisioning \ + record not found; perhaps it has already been deleted?"; + "instance_id" => %authz_instance.id(), + "propolis_id" => %vmm_id, + "instance_update" => %"VMM destroyed", + ); + } + Err(err) => return Err(ActionError::action_failed(err)), + }; + + Ok(()) } async fn siud_unassign_oximeter_producer( From 3d35078928a3a45cd945f0e5c5d86fe4e2010773 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 25 Jun 2024 11:12:52 -0700 Subject: [PATCH 074/234] move migration stuff out of a subsaga It doesn't really need to be a subsaga, since we only ever build one of them...this is a bit simpler. --- .../app/sagas/instance_update/migration.rs | 219 ------------------ nexus/src/app/sagas/instance_update/mod.rs | 217 ++++++++++++++--- nexus/src/app/sagas/instance_update/start.rs | 1 - 3 files changed, 183 insertions(+), 254 deletions(-) delete mode 100644 nexus/src/app/sagas/instance_update/migration.rs diff --git a/nexus/src/app/sagas/instance_update/migration.rs b/nexus/src/app/sagas/instance_update/migration.rs deleted file mode 100644 index c382fbf30b7..00000000000 --- a/nexus/src/app/sagas/instance_update/migration.rs +++ /dev/null @@ -1,219 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- -use super::{ActionRegistry, NexusActionContext, NexusSaga}; -use crate::app::db::model::Generation; -use crate::app::db::model::Instance; -use crate::app::db::model::Migration; -use crate::app::db::model::MigrationState; -use crate::app::sagas::declare_saga_actions; -use chrono::Utc; -use nexus_db_queries::db::lookup::LookupPath; -use nexus_db_queries::{authn, authz}; -use omicron_common::api::external::Error; -use omicron_uuid_kinds::GenericUuid; -use omicron_uuid_kinds::InstanceUuid; -use omicron_uuid_kinds::PropolisUuid; -use serde::{Deserialize, Serialize}; -use steno::ActionError; - -// instance update (migration) subsaga: actions - -// This subsaga is responsible for handling an instance update where either side -// of an active migration reports that the migration has failed or completed. -declare_saga_actions! { - instance_update_migration; - - // Update the instance record to reflect the migration event. If the - // migration has completed on the target VMM, or if the migration has - // failed, this will clear the migration IDs, allowing the instance to - // migrate again. If the migration has completed on either VMM, the target - // VMM becomes the active VMM. - UPDATE_INSTANCE_RECORD -> "update_instance_record" { - + sium_update_instance_record - } - - // Update network configuration to point to the new active VMM. - UPDATE_NETWORK_CONFIG -> "update_network_config" { - + sium_update_network_config - } -} - -/// Parameters to the instance update (migration) sub-saga. -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct Params { - /// Authentication context to use to fetch the instance's current state from - /// the database. - pub(super) serialized_authn: authn::saga::Serialized, - - pub(super) authz_instance: authz::Instance, - - pub(super) instance: Instance, - - pub(super) migration: Migration, -} - -#[derive(Debug)] -pub(super) struct SagaMigrationUpdate; -impl NexusSaga for SagaMigrationUpdate { - const NAME: &'static str = "instance-update-migration"; - type Params = Params; - - fn register_actions(registry: &mut ActionRegistry) { - instance_update_migration_register_actions(registry); - } - - fn make_saga_dag( - _: &Self::Params, - mut builder: steno::DagBuilder, - ) -> Result { - builder.append(update_instance_record_action()); - builder.append(update_network_config_action()); - - Ok(builder.build()?) - } -} - -async fn sium_update_instance_record( - sagactx: NexusActionContext, -) -> Result { - let Params { ref authz_instance, ref migration, ref instance, .. } = - sagactx.saga_params()?; - - let osagactx = sagactx.user_data(); - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - - let mut new_runtime = instance.runtime().clone(); - new_runtime.gen = Generation(new_runtime.gen.next()); - new_runtime.time_updated = Utc::now(); - - // Determine how to update the instance record to reflect the current - // migration state. - let failed = migration.either_side_failed(); - // If the migration has failed, or if the target reports that the migration - // has completed, clear the instance record's migration IDs so that a new - // migration can begin. 
- if failed || migration.target_state == MigrationState::COMPLETED { - info!( - osagactx.log(), - "instance update (migration {}): clearing migration IDs", - if failed { "failed" } else { "target_completed" }; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - "instance_update" => %"migration", - ); - new_runtime.migration_id = None; - new_runtime.dst_propolis_id = None; - } - - // If either side reports that the migration has completed, move the target - // Propolis ID to the active position. - let new_propolis_id = if !failed && migration.either_side_completed() { - info!( - osagactx.log(), - "instance update (migration completed): setting active VMM ID to target"; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - "instance_update" => %"migration", - ); - new_runtime.propolis_id = Some(migration.target_propolis_id); - migration.target_propolis_id - } else { - migration.source_propolis_id - }; - - osagactx - .datastore() - .instance_update_runtime(&instance_id, &new_runtime) - .await - .map_err(ActionError::action_failed)?; - - Ok(PropolisUuid::from_untyped_uuid(new_propolis_id)) -} - -async fn sium_update_network_config( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let Params { ref serialized_authn, ref authz_instance, migration, .. } = - sagactx.saga_params()?; - - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - let osagactx = sagactx.user_data(); - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - - // Either the instance moved from one sled to another, or it attempted - // to migrate and failed. Ensure the correct networking configuration - // exists for its current home. - // - // TODO(#3107) This is necessary even if the instance didn't move, - // because registering a migration target on a sled creates OPTE ports - // for its VNICs, and that creates new V2P mappings on that sled that - // place the relevant virtual IPs on the local sled. Once OPTE stops - // creating these mappings, this path only needs to be taken if an - // instance has changed sleds. - - // Look up the ID of the sled that the instance now resides on, so that we - // can look up its address. - let active_propolis_id = - sagactx.lookup::("update_instance_record")?; - let new_sled_id = match osagactx - .datastore() - .vmm_fetch(&opctx, authz_instance, &active_propolis_id) - .await - { - Ok(vmm) => vmm.sled_id, - - // A VMM in the active position should never be destroyed. If the - // sled sending this message is the owner of the instance's last - // active VMM and is destroying it, it should also have retired that - // VMM. - Err(Error::ObjectNotFound { .. 
}) => { - error!(osagactx.log(), "instance's active vmm unexpectedly not found"; - "instance_id" => %instance_id, - "propolis_id" => %active_propolis_id); - - return Ok(()); - } - Err(e) => return Err(ActionError::action_failed(e)), - }; - - info!( - osagactx.log(), - "instance update (migration): ensuring updated instance network config"; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - "active_propolis_id" => %active_propolis_id, - "sled_id" => %new_sled_id, - "migration_failed" => migration.either_side_failed(), - ); - - if let Err(e) = osagactx.nexus().v2p_notification_tx.send(()) { - error!( - osagactx.log(), - "error notifying background task of v2p change"; - "error" => ?e - ) - }; - - let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) - .sled_id(new_sled_id) - .fetch() - .await - .map_err(ActionError::action_failed)?; - - osagactx - .nexus() - .instance_ensure_dpd_config(&opctx, instance_id, &sled.address(), None) - .await - .map_err(ActionError::action_failed)?; - - Ok(()) -} diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 89c6f78c209..872ba03a71a 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -8,17 +8,22 @@ use super::{ }; use crate::app::db::datastore::instance; use crate::app::db::datastore::InstanceSnapshot; +use crate::app::db::lookup::LookupPath; +use crate::app::db::model::Generation; use crate::app::db::model::VmmState; +use crate::app::db::model::{Migration, MigrationState}; use crate::app::sagas::declare_saga_actions; +use chrono::Utc; use nexus_db_queries::{authn, authz}; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; +use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node, SagaName}; use uuid::Uuid; mod destroyed; -mod migration; // The public interface to this saga is actually a smaller saga that starts the // "real" update saga, which inherits the lock from the start saga. This is @@ -49,6 +54,7 @@ struct RealParams { const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const INSTANCE_LOCK: &str = "updater_lock"; +const MIGRATION: &str = "migration"; // instance update saga: actions @@ -61,6 +67,20 @@ declare_saga_actions! { - siu_unbecome_updater } + // Update the instance record to reflect a migration event. If the + // migration has completed on the target VMM, or if the migration has + // failed, this will clear the migration IDs, allowing the instance to + // migrate again. If the migration has completed on either VMM, the target + // VMM becomes the active VMM. + MIGRATION_UPDATE_INSTANCE -> "migration_update_instance" { + + siu_migration_update_instance + } + + // Update network configuration to point to the new active VMM. 
+ MIGRATION_UPDATE_NETWORK_CONFIG -> "migration_update_network_config" { + + siu_migration_update_network_config + } + UNLOCK_INSTANCE -> "unlocked" { + siu_unlock_instance } @@ -88,6 +108,16 @@ impl NexusSaga for SagaDoActualInstanceUpdate { )); builder.append(become_updater_action()); + fn const_node( + name: &'static str, + value: &impl serde::Serialize, + ) -> Result { + let value = serde_json::to_value(value).map_err(|e| { + SagaInitError::SerializeError(name.to_string(), e) + })?; + Ok(Node::constant(name, value)) + } + // determine which subsaga(s) to execute based on the state of the instance // and the VMMs associated with it. if let Some(ref active_vmm) = params.state.active_vmm { @@ -140,39 +170,13 @@ impl NexusSaga for SagaDoActualInstanceUpdate { // If either side of the migration reports a terminal state, update // the instance to reflect that. if migration.is_terminal() { - const MIGRATION_SUBSAGA_PARAMS: &str = - "params_for_migration_subsaga"; - let subsaga_params = migration::Params { - serialized_authn: params.serialized_authn.clone(), - authz_instance: params.authz_instance.clone(), - instance: params.state.instance.clone(), - migration: migration.clone(), - }; - let subsaga_dag = { - let subsaga_builder = DagBuilder::new(SagaName::new( - migration::SagaMigrationUpdate::NAME, - )); - migration::SagaMigrationUpdate::make_saga_dag( - &subsaga_params, - subsaga_builder, - )? - }; - - builder.append(Node::constant( - MIGRATION_SUBSAGA_PARAMS, - serde_json::to_value(&subsaga_params).map_err(|e| { - SagaInitError::SerializeError( - MIGRATION_SUBSAGA_PARAMS.to_string(), - e, - ) - })?, - )); - - builder.append(Node::subsaga( - "migration_subsaga_no_result", - subsaga_dag, - MIGRATION_SUBSAGA_PARAMS, - )); + builder.append(const_node(MIGRATION, migration)?); + // TODO(eliza): perhaps we could determine the final state in + // `make_saga_dag` and push a constant node for it, and then + // only have one `update_instance` action that's run regardless + // of which path through the saga we build... + builder.append(migration_update_instance_action()); + builder.append(migration_update_network_config_action()); } } @@ -231,6 +235,151 @@ async fn siu_unbecome_updater( Ok(()) } +async fn siu_migration_update_instance( + sagactx: NexusActionContext, +) -> Result { + let RealParams { ref authz_instance, ref state, .. } = + sagactx.saga_params()?; + let migration = sagactx.lookup::(MIGRATION)?; + + let osagactx = sagactx.user_data(); + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + let mut new_runtime = state.instance.runtime().clone(); + new_runtime.gen = Generation(new_runtime.gen.next()); + new_runtime.time_updated = Utc::now(); + + // Determine how to update the instance record to reflect the current + // migration state. + let failed = migration.either_side_failed(); + // If the migration has failed, or if the target reports that the migration + // has completed, clear the instance record's migration IDs so that a new + // migration can begin. 
+ if failed || migration.target_state == MigrationState::COMPLETED { + info!( + osagactx.log(), + "instance update (migration {}): clearing migration IDs", + if failed { "failed" } else { "target_completed" }; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + "instance_update" => %"migration", + ); + new_runtime.migration_id = None; + new_runtime.dst_propolis_id = None; + } + + // If either side reports that the migration has completed, move the target + // Propolis ID to the active position. + let new_propolis_id = if !failed && migration.either_side_completed() { + info!( + osagactx.log(), + "instance update (migration completed): setting active VMM ID to target"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + "instance_update" => %"migration", + ); + new_runtime.propolis_id = Some(migration.target_propolis_id); + migration.target_propolis_id + } else { + migration.source_propolis_id + }; + + osagactx + .datastore() + .instance_update_runtime(&instance_id, &new_runtime) + .await + .map_err(ActionError::action_failed)?; + + Ok(PropolisUuid::from_untyped_uuid(new_propolis_id)) +} + +async fn siu_migration_update_network_config( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let Params { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params()?; + + let migration = sagactx.lookup::(MIGRATION)?; + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + // Either the instance moved from one sled to another, or it attempted + // to migrate and failed. Ensure the correct networking configuration + // exists for its current home. + // + // TODO(#3107) This is necessary even if the instance didn't move, + // because registering a migration target on a sled creates OPTE ports + // for its VNICs, and that creates new V2P mappings on that sled that + // place the relevant virtual IPs on the local sled. Once OPTE stops + // creating these mappings, this path only needs to be taken if an + // instance has changed sleds. + + // Look up the ID of the sled that the instance now resides on, so that we + // can look up its address. + let active_propolis_id = + sagactx.lookup::("update_instance_record")?; + let new_sled_id = match osagactx + .datastore() + .vmm_fetch(&opctx, authz_instance, &active_propolis_id) + .await + { + Ok(vmm) => vmm.sled_id, + + // A VMM in the active position should never be destroyed. If the + // sled sending this message is the owner of the instance's last + // active VMM and is destroying it, it should also have retired that + // VMM. + Err(Error::ObjectNotFound { .. 
}) => { + error!(osagactx.log(), "instance's active vmm unexpectedly not found"; + "instance_id" => %instance_id, + "propolis_id" => %active_propolis_id); + + return Ok(()); + } + Err(e) => return Err(ActionError::action_failed(e)), + }; + + info!( + osagactx.log(), + "instance update (migration): ensuring updated instance network config"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + "active_propolis_id" => %active_propolis_id, + "sled_id" => %new_sled_id, + "migration_failed" => migration.either_side_failed(), + ); + + if let Err(e) = osagactx.nexus().v2p_notification_tx.send(()) { + error!( + osagactx.log(), + "error notifying background task of v2p change"; + "error" => ?e + ) + }; + + let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) + .sled_id(new_sled_id) + .fetch() + .await + .map_err(ActionError::action_failed)?; + + osagactx + .nexus() + .instance_ensure_dpd_config(&opctx, instance_id, &sled.address(), None) + .await + .map_err(ActionError::action_failed)?; + + Ok(()) +} + async fn siu_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 8c4d79cf19f..f6ccb1053fe 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -62,7 +62,6 @@ impl NexusSaga for SagaInstanceUpdate { start_instance_update_register_actions(registry); super::SagaDoActualInstanceUpdate::register_actions(registry); super::destroyed::SagaVmmDestroyed::register_actions(registry); - super::migration::SagaMigrationUpdate::register_actions(registry); } fn make_saga_dag( From 4c4d648bd91b6fd183c8fada1c795f3e8cd8ffb1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 25 Jun 2024 11:38:52 -0700 Subject: [PATCH 075/234] also get rid of subsaga for active-vmm-destroyed --- .../app/sagas/instance_update/destroyed.rs | 143 +++++------------- nexus/src/app/sagas/instance_update/mod.rs | 120 +++++++++------ nexus/src/app/sagas/instance_update/start.rs | 1 - 3 files changed, 111 insertions(+), 153 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 75c9cc38927..f81e21c74bb 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -2,10 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::ActionRegistry; use super::NexusActionContext; -use super::NexusSaga; -use crate::app::sagas::declare_saga_actions; +use super::RealParams; +use super::DESTROYED_VMM_ID; use crate::app::sagas::ActionError; use chrono::Utc; use nexus_db_model::Generation; @@ -21,52 +20,9 @@ use omicron_uuid_kinds::PropolisUuid; use serde::{Deserialize, Serialize}; use slog::info; -// instance update (active VMM destroyed) subsaga: actions - -// This subsaga is responsible for handling an instance update where the -// instance's active VMM has entered the `Destroyed` state. This requires -// deallocating resources assigned to the instance, updating the instance's -// records in the database, and marking the VMM as deleted. -declare_saga_actions! { - instance_update_destroyed; - - // Deallocate physical sled resources reserved for the destroyed VMM, as it - // is no longer using them. 
- RELEASE_SLED_RESOURCES -> "no_result1" { - + siud_release_sled_resources - } - - // Deallocate virtual provisioning resources reserved by the instance, as it - // is no longer running. - RELEASE_VIRTUAL_PROVISIONING -> "no_result2" { - + siud_release_virtual_provisioning - } - - // Unassign the instance's Oximeter producer. - UNASSIGN_OXIMETER_PRODUCER -> "no_result3" { - + siud_unassign_oximeter_producer - } - - DELETE_V2P_MAPPINGS -> "no_result4" { - + siud_delete_v2p_mappings - } - - DELETE_NAT_ENTRIES -> "no_result5" { - + siud_delete_nat_entries - } - - UPDATE_INSTANCE -> "no_result6" { - + siud_update_instance - } - - MARK_VMM_DELETED -> "no_result7" { - + siud_mark_vmm_deleted - } -} - /// Parameters to the instance update (active VMM destroyed) sub-saga. #[derive(Debug, Deserialize, Serialize)] -pub(super) struct Params { +pub(super) struct RealRealParams { /// Authentication context to use to fetch the instance's current state from /// the database. pub(super) serialized_authn: authn::saga::Serialized, @@ -79,38 +35,13 @@ pub(super) struct Params { pub(super) instance: Instance, } -#[derive(Debug)] -pub(super) struct SagaVmmDestroyed; -impl NexusSaga for SagaVmmDestroyed { - const NAME: &'static str = "instance-update-vmm-destroyed"; - type Params = Params; - - fn register_actions(registry: &mut ActionRegistry) { - instance_update_destroyed_register_actions(registry); - } - - fn make_saga_dag( - _params: &Self::Params, - mut builder: steno::DagBuilder, - ) -> Result { - builder.append(release_sled_resources_action()); - builder.append(release_virtual_provisioning_action()); - builder.append(unassign_oximeter_producer_action()); - builder.append(delete_v2p_mappings_action()); - builder.append(delete_nat_entries_action()); - builder.append(update_instance_action()); - builder.append(mark_vmm_deleted_action()); - - Ok(builder.build()?) - } -} - -async fn siud_release_sled_resources( +pub(super) async fn siu_destroyed_release_sled_resources( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, vmm_id, .. } = - sagactx.saga_params::()?; + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -137,17 +68,16 @@ async fn siud_release_sled_resources( .map_err(ActionError::action_failed) } -async fn siud_release_virtual_provisioning( +pub(super) async fn siu_destroyed_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { - ref serialized_authn, - ref authz_instance, - vmm_id, - instance, - .. - } = sagactx.saga_params::()?; + let RealParams { ref serialized_authn, ref authz_instance, state, .. 
} = + sagactx.saga_params::()?; + + let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; + let instance = state.instance; + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -172,7 +102,7 @@ async fn siud_release_virtual_provisioning( .datastore() .virtual_provisioning_collection_delete_instance( &opctx, - InstanceUuid::from_untyped_uuid(authz_instance.id()), + instance_id, instance.project_id, i64::from(instance.ncpus.0 .0), instance.memory, @@ -185,7 +115,7 @@ async fn siud_release_virtual_provisioning( osagactx.log(), "instance update (VMM destroyed): deallocated virtual \ provisioning resources"; - "instance_id" => %authz_instance.id(), + "instance_id" => %instance_id, "propolis_id" => %vmm_id, "records_deleted" => ?deleted, "instance_update" => %"VMM destroyed", @@ -203,7 +133,7 @@ async fn siud_release_virtual_provisioning( osagactx.log(), "instance update (VMM destroyed): virtual provisioning \ record not found; perhaps it has already been deleted?"; - "instance_id" => %authz_instance.id(), + "instance_id" => %instance_id, "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); @@ -214,12 +144,12 @@ async fn siud_release_virtual_provisioning( Ok(()) } -async fn siud_unassign_oximeter_producer( +pub(super) async fn siu_destroyed_unassign_oximeter_producer( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -234,11 +164,12 @@ async fn siud_unassign_oximeter_producer( .map_err(ActionError::action_failed) } -async fn siud_delete_v2p_mappings( +pub(super) async fn siu_destroyed_delete_v2p_mappings( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Params { ref authz_instance, vmm_id, .. } = - sagactx.saga_params::()?; + let RealParams { ref authz_instance, .. } = + sagactx.saga_params::()?; + let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; let osagactx = sagactx.user_data(); info!( @@ -254,13 +185,13 @@ async fn siud_delete_v2p_mappings( Ok(()) } -async fn siud_delete_nat_entries( +pub(super) async fn siu_destroyed_delete_nat_entries( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref serialized_authn, ref authz_instance, vmm_id, .. } = - sagactx.saga_params::()?; - + let RealParams { ref serialized_authn, ref authz_instance, .. } = + sagactx.saga_params::()?; + let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -280,11 +211,14 @@ async fn siud_delete_nat_entries( Ok(()) } -async fn siud_update_instance( +pub(super) async fn siu_destroyed_update_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Params { ref authz_instance, ref vmm_id, instance, .. } = - sagactx.saga_params::()?; + let RealParams { ref authz_instance, state, .. 
} = + sagactx.saga_params::()?; + + let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; + let instance = state.instance; let osagactx = sagactx.user_data(); let new_runtime = InstanceRuntimeState { propolis_id: None, @@ -325,12 +259,13 @@ async fn siud_update_instance( Ok(()) } -async fn siud_mark_vmm_deleted( +pub(super) async fn siu_destroyed_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let Params { ref authz_instance, ref vmm_id, ref serialized_authn, .. } = - sagactx.saga_params::()?; + let RealParams { ref authz_instance, ref serialized_authn, .. } = + sagactx.saga_params::()?; + let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -345,7 +280,7 @@ async fn siud_mark_vmm_deleted( osagactx .datastore() - .vmm_mark_deleted(&opctx, vmm_id) + .vmm_mark_deleted(&opctx, &vmm_id) .await .map(|_| ()) .map_err(ActionError::action_failed) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 872ba03a71a..600cf233535 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -20,10 +20,11 @@ use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; use serde::{Deserialize, Serialize}; -use steno::{ActionError, DagBuilder, Node, SagaName}; +use steno::{ActionError, DagBuilder, Node}; use uuid::Uuid; mod destroyed; +use destroyed::*; // The public interface to this saga is actually a smaller saga that starts the // "real" update saga, which inherits the lock from the start saga. This is @@ -54,6 +55,7 @@ struct RealParams { const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const INSTANCE_LOCK: &str = "updater_lock"; +const DESTROYED_VMM_ID: &str = "destroyed_vmm_id"; const MIGRATION: &str = "migration"; // instance update saga: actions @@ -67,6 +69,12 @@ declare_saga_actions! { - siu_unbecome_updater } + UNLOCK_INSTANCE -> "unlocked" { + + siu_unlock_instance + } + + // === migration update actions === + // Update the instance record to reflect a migration event. If the // migration has completed on the target VMM, or if the migration has // failed, this will clear the migration IDs, allowing the instance to @@ -81,8 +89,39 @@ declare_saga_actions! { + siu_migration_update_network_config } - UNLOCK_INSTANCE -> "unlocked" { - + siu_unlock_instance + // === active VMM destroyed actions === + + // Deallocate physical sled resources reserved for the destroyed VMM, as it + // is no longer using them. + DESTROYED_RELEASE_SLED_RESOURCES -> "destroyed_vmm_release_sled_resources" { + + siu_destroyed_release_sled_resources + } + + // Deallocate virtual provisioning resources reserved by the instance, as it + // is no longer running. + DESTROYED_RELEASE_VIRTUAL_PROVISIONING -> "destroyed_vmm_release_virtual_provisioning" { + + siu_destroyed_release_virtual_provisioning + } + + // Unassign the instance's Oximeter producer. 
+ DESTROYED_UNASSIGN_OXIMETER_PRODUCER -> "destroyed_vmm_unassign_oximeter" { + + siu_destroyed_unassign_oximeter_producer + } + + DESTROYED_DELETE_V2P_MAPPINGS -> "destroyed_vmm_delete_v2p_mappings" { + + siu_destroyed_delete_v2p_mappings + } + + DESTROYED_DELETE_NAT_ENTRIES -> "destroyed_vmm_delete_nat_entries" { + + siu_destroyed_delete_nat_entries + } + + DESTROYED_UPDATE_INSTANCE -> "destroyed_vmm_update_instance" { + + siu_destroyed_update_instance + } + + DESTROYED_MARK_VMM_DELETED -> "destroyed_mark_vmm_deleted" { + + siu_destroyed_mark_vmm_deleted } } @@ -101,13 +140,6 @@ impl NexusSaga for SagaDoActualInstanceUpdate { params: &Self::Params, mut builder: DagBuilder, ) -> Result { - builder.append(Node::action( - INSTANCE_LOCK_ID, - "GenerateInstanceLockId", - ACTION_GENERATE_ID.as_ref(), - )); - builder.append(become_updater_action()); - fn const_node( name: &'static str, value: &impl serde::Serialize, @@ -118,45 +150,34 @@ impl NexusSaga for SagaDoActualInstanceUpdate { Ok(Node::constant(name, value)) } - // determine which subsaga(s) to execute based on the state of the instance - // and the VMMs associated with it. + builder.append(Node::action( + INSTANCE_LOCK_ID, + "GenerateInstanceLockId", + ACTION_GENERATE_ID.as_ref(), + )); + builder.append(become_updater_action()); + + // TODO(eliza): perhaps the start saga could determine whether it even + // needs to create a "real" saga based on whether either of the relevant + // state changes have occurred? + + // If the active VMM's state has changed to "destroyed", then clean up + // after it. if let Some(ref active_vmm) = params.state.active_vmm { // If the active VMM is `Destroyed`, schedule the active VMM // destroyed subsaga. if active_vmm.runtime.state == VmmState::Destroyed { - const DESTROYED_SUBSAGA_PARAMS: &str = - "params_for_vmm_destroyed_subsaga"; - let subsaga_params = destroyed::Params { - serialized_authn: params.serialized_authn.clone(), - authz_instance: params.authz_instance.clone(), - vmm_id: PropolisUuid::from_untyped_uuid(active_vmm.id), - instance: params.state.instance.clone(), - }; - let subsaga_dag = { - let subsaga_builder = DagBuilder::new(SagaName::new( - destroyed::SagaVmmDestroyed::NAME, - )); - destroyed::SagaVmmDestroyed::make_saga_dag( - &subsaga_params, - subsaga_builder, - )? - }; - - builder.append(Node::constant( - DESTROYED_SUBSAGA_PARAMS, - serde_json::to_value(&subsaga_params).map_err(|e| { - SagaInitError::SerializeError( - DESTROYED_SUBSAGA_PARAMS.to_string(), - e, - ) - })?, - )); - - builder.append(Node::subsaga( - "vmm_destroyed_subsaga_no_result", - subsaga_dag, - DESTROYED_SUBSAGA_PARAMS, - )); + builder.append(const_node( + DESTROYED_VMM_ID, + &PropolisUuid::from_untyped_uuid(active_vmm.id), + )?); + builder.append(destroyed_release_sled_resources_action()); + builder.append(destroyed_release_virtual_provisioning_action()); + builder.append(destroyed_unassign_oximeter_producer_action()); + builder.append(destroyed_delete_v2p_mappings_action()); + builder.append(destroyed_delete_nat_entries_action()); + builder.append(destroyed_update_instance_action()); + builder.append(destroyed_mark_vmm_deleted_action()); } } @@ -336,9 +357,12 @@ async fn siu_migration_update_network_config( // active VMM and is destroying it, it should also have retired that // VMM. Err(Error::ObjectNotFound { .. 
}) => { - error!(osagactx.log(), "instance's active vmm unexpectedly not found"; - "instance_id" => %instance_id, - "propolis_id" => %active_propolis_id); + error!( + osagactx.log(), + "instance's active vmm unexpectedly not found"; + "instance_id" => %instance_id, + "propolis_id" => %active_propolis_id, + ); return Ok(()); } diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index f6ccb1053fe..b53b4e5ed4c 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -61,7 +61,6 @@ impl NexusSaga for SagaInstanceUpdate { fn register_actions(registry: &mut ActionRegistry) { start_instance_update_register_actions(registry); super::SagaDoActualInstanceUpdate::register_actions(registry); - super::destroyed::SagaVmmDestroyed::register_actions(registry); } fn make_saga_dag( From d21b60d21f56f86048cc5803ac1b251930273149 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 25 Jun 2024 11:49:10 -0700 Subject: [PATCH 076/234] clean up network config actions --- .../app/sagas/instance_update/destroyed.rs | 45 +++---------------- nexus/src/app/sagas/instance_update/mod.rs | 30 +++++-------- 2 files changed, 19 insertions(+), 56 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index f81e21c74bb..75bd27793ee 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -8,32 +8,12 @@ use super::DESTROYED_VMM_ID; use crate::app::sagas::ActionError; use chrono::Utc; use nexus_db_model::Generation; -use nexus_db_model::Instance; use nexus_db_model::InstanceRuntimeState; use nexus_db_model::InstanceState; -use nexus_db_queries::authn; -use nexus_db_queries::authz; use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; -use serde::{Deserialize, Serialize}; -use slog::info; - -/// Parameters to the instance update (active VMM destroyed) sub-saga. -#[derive(Debug, Deserialize, Serialize)] -pub(super) struct RealRealParams { - /// Authentication context to use to fetch the instance's current state from - /// the database. - pub(super) serialized_authn: authn::saga::Serialized, - - pub(super) authz_instance: authz::Instance, - - /// The UUID of the VMM that was destroyed. - pub(super) vmm_id: PropolisUuid, - - pub(super) instance: Instance, -} pub(super) async fn siu_destroyed_release_sled_resources( sagactx: NexusActionContext, @@ -164,14 +144,15 @@ pub(super) async fn siu_destroyed_unassign_oximeter_producer( .map_err(ActionError::action_failed) } -pub(super) async fn siu_destroyed_delete_v2p_mappings( +pub(super) async fn siu_destroyed_update_network_config( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let RealParams { ref authz_instance, .. } = + let osagactx = sagactx.user_data(); + let RealParams { ref serialized_authn, ref authz_instance, .. 
} = sagactx.saga_params::()?; let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; + let nexus = osagactx.nexus(); - let osagactx = sagactx.user_data(); info!( osagactx.log(), "instance update (VMM destroyed): deleting V2P mappings"; @@ -180,20 +161,7 @@ pub(super) async fn siu_destroyed_delete_v2p_mappings( "instance_update" => %"VMM destroyed", ); - let nexus = osagactx.nexus(); nexus.background_tasks.activate(&nexus.background_tasks.task_v2p_manager); - Ok(()) -} - -pub(super) async fn siu_destroyed_delete_nat_entries( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); info!( osagactx.log(), @@ -203,8 +171,9 @@ pub(super) async fn siu_destroyed_delete_nat_entries( "instance_update" => %"VMM destroyed", ); - osagactx - .nexus() + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + nexus .instance_delete_dpd_config(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 600cf233535..3087321cce6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -108,12 +108,10 @@ declare_saga_actions! { + siu_destroyed_unassign_oximeter_producer } - DESTROYED_DELETE_V2P_MAPPINGS -> "destroyed_vmm_delete_v2p_mappings" { - + siu_destroyed_delete_v2p_mappings - } - - DESTROYED_DELETE_NAT_ENTRIES -> "destroyed_vmm_delete_nat_entries" { - + siu_destroyed_delete_nat_entries + // Notify the V2P manager background task to delete the destroyed VMM's V2P + // mappings, and delete the destroyed VMM's NAT entries. + DESTROYED_UPDATE_NETWORK_CONFIG -> "destroyed_update_network_config" { + + siu_destroyed_update_network_config } DESTROYED_UPDATE_INSTANCE -> "destroyed_vmm_update_instance" { @@ -174,8 +172,7 @@ impl NexusSaga for SagaDoActualInstanceUpdate { builder.append(destroyed_release_sled_resources_action()); builder.append(destroyed_release_virtual_provisioning_action()); builder.append(destroyed_unassign_oximeter_producer_action()); - builder.append(destroyed_delete_v2p_mappings_action()); - builder.append(destroyed_delete_nat_entries_action()); + builder.append(destroyed_update_network_config_action()); builder.append(destroyed_update_instance_action()); builder.append(destroyed_mark_vmm_deleted_action()); } @@ -318,6 +315,9 @@ async fn siu_migration_update_instance( Ok(PropolisUuid::from_untyped_uuid(new_propolis_id)) } +// TODO(eliza): the `update_network_config` actions for migration and +// destroyed-active-vmm *could* probably be combined...look into whether this is +// a good idea or not. async fn siu_migration_update_network_config( sagactx: NexusActionContext, ) -> Result<(), ActionError> { @@ -344,7 +344,7 @@ async fn siu_migration_update_network_config( // Look up the ID of the sled that the instance now resides on, so that we // can look up its address. 
let active_propolis_id = - sagactx.lookup::("update_instance_record")?; + sagactx.lookup::("migration_update_instance")?; let new_sled_id = match osagactx .datastore() .vmm_fetch(&opctx, authz_instance, &active_propolis_id) @@ -381,13 +381,8 @@ async fn siu_migration_update_network_config( "migration_failed" => migration.either_side_failed(), ); - if let Err(e) = osagactx.nexus().v2p_notification_tx.send(()) { - error!( - osagactx.log(), - "error notifying background task of v2p change"; - "error" => ?e - ) - }; + let nexus = osagactx.nexus(); + nexus.background_tasks.activate(&nexus.background_tasks.task_v2p_manager); let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) .sled_id(new_sled_id) @@ -395,8 +390,7 @@ async fn siu_migration_update_network_config( .await .map_err(ActionError::action_failed)?; - osagactx - .nexus() + nexus .instance_ensure_dpd_config(&opctx, instance_id, &sled.address(), None) .await .map_err(ActionError::action_failed)?; From 76aee1c83309de62fa5a3d7d0eda02f05dc75a22 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 1 Jul 2024 11:19:21 -0700 Subject: [PATCH 077/234] post-rebase fixy-uppy --- nexus/src/app/background/init.rs | 2 +- nexus/src/app/background/tasks/instance_updater.rs | 2 +- nexus/src/app/instance.rs | 9 +++++---- nexus/src/app/sagas/instance_update/start.rs | 3 ++- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index e808b37557a..f9917548bbd 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -647,7 +647,7 @@ impl BackgroundTasksInitializer { Box::new(updater), opctx.child(BTreeMap::new()), vec![], - task_instance_updaterm, + task_instance_updater, ); } diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 4db099645cb..ec1b3096800 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -6,7 +6,7 @@ //! //! 
TODO this is currently a placeholder for a future PR -use super::common::BackgroundTask; +use crate::app::background::BackgroundTask; use crate::app::sagas::instance_update; use crate::app::sagas::SagaRequest; use anyhow::Context; diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 0bf7dd6c815..f989454e28c 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1418,10 +1418,11 @@ impl super::Nexus { serialized_authn: authn::saga::Serialized::for_opctx(opctx), authz_instance, }; - self.execute_saga::( - saga_params, - ) - .await?; + self.sagas + .saga_execute::( + saga_params, + ) + .await?; } Ok(()) } diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index b53b4e5ed4c..476e6f0bde2 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -153,7 +153,8 @@ async fn siu_fetch_state_and_start_real_saga( .map_err(ActionError::action_failed)?; osagactx .nexus() - .execute_saga::(super::RealParams { + .sagas + .saga_execute::(super::RealParams { serialized_authn, authz_instance, state, From e97e6e0fb9067abc1fa589688e895426a13e9a27 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 1 Jul 2024 11:48:17 -0700 Subject: [PATCH 078/234] queue update sagas for terminated migrations --- nexus/db-model/src/migration_state.rs | 3 + nexus/db-queries/src/db/datastore/instance.rs | 54 ++++++++--- .../app/background/tasks/instance_updater.rs | 95 +++++++++++++------ 3 files changed, 114 insertions(+), 38 deletions(-) diff --git a/nexus/db-model/src/migration_state.rs b/nexus/db-model/src/migration_state.rs index c06bbd67ea8..e1662f2c28d 100644 --- a/nexus/db-model/src/migration_state.rs +++ b/nexus/db-model/src/migration_state.rs @@ -37,6 +37,9 @@ impl MigrationState { pub const IN_PROGRESS: MigrationState = MigrationState(nexus::MigrationState::InProgress); + pub const TERMINAL_STATES: &'static [MigrationState] = + &[Self::COMPLETED, Self::FAILED]; + /// Returns `true` if this migration state means that the migration is no /// longer in progress (it has either succeeded or failed). #[must_use] diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 950e5f3118b..eab6a93e388 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -299,15 +299,17 @@ impl DataStore { /// List all instances with active VMMs in the `Destroyed` state that don't /// have currently-running instance-updater sagas. + /// + /// This is used by the `instance_updater` background task to ensure that + /// update sagas are scheduled for these instances. pub async fn find_instances_with_destroyed_active_vmms( &self, opctx: &OpContext, - ) -> ListResultVec { + ) -> ListResultVec { use db::model::VmmState; use db::schema::instance::dsl; use db::schema::vmm::dsl as vmm_dsl; Ok(vmm_dsl::vmm - .filter(vmm_dsl::time_deleted.is_not_null()) .filter(vmm_dsl::state.eq(VmmState::Destroyed)) .inner_join( dsl::instance.on(dsl::active_propolis_id @@ -315,18 +317,48 @@ impl DataStore { .and(dsl::time_deleted.is_null()) .and(dsl::updater_id.is_null())), ) - .select((Instance::as_select(), Vmm::as_select())) - .load_async::<(Instance, Vmm)>( + .select(Instance::as_select()) + .load_async::( &*self.pool_connection_authorized(opctx).await?, ) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? 
- .into_iter() - .map(|(instance, vmm)| InstanceAndActiveVmm { - instance, - vmm: Some(vmm), - }) - .collect()) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?) + } + + /// List all instances with active migrations that have terminated (either + /// completed or failed) and don't have currently-running instance-updater + /// sagas. + /// + /// This is used by the `instance_updater` background task to ensure that + /// update sagas are scheduled for these instances. + pub async fn find_instances_with_terminated_active_migrations( + &self, + opctx: &OpContext, + ) -> ListResultVec { + use db::model::MigrationState; + use db::schema::instance::dsl; + use db::schema::migration::dsl as migration_dsl; + + Ok(dsl::instance + .filter(dsl::time_deleted.is_null()) + .filter(dsl::migration_id.is_not_null()) + .filter(dsl::updater_id.is_null()) + .inner_join( + migration_dsl::migration.on(dsl::migration_id + .eq(migration_dsl::id.nullable()) + .and( + migration_dsl::target_state + .eq_any(MigrationState::TERMINAL_STATES) + .or(migration_dsl::source_state + .eq_any(MigrationState::TERMINAL_STATES)), + )), + ) + .select(Instance::as_select()) + .load_async::( + &*self.pool_connection_authorized(opctx).await?, + ) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?) } /// Fetches information about an Instance that the caller has previously diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index ec1b3096800..cced8e82036 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -3,8 +3,6 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. //! Background task for detecting instances in need of update sagas. -//! -//! TODO this is currently a placeholder for a future PR use crate::app::background::BackgroundTask; use crate::app::sagas::instance_update; @@ -12,13 +10,15 @@ use crate::app::sagas::SagaRequest; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; +use nexus_db_model::Instance; use nexus_db_queries::context::OpContext; -use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; +use omicron_common::api::external::ListResultVec; use serde_json::json; +use std::future::Future; use std::sync::Arc; use tokio::sync::mpsc::Sender; @@ -40,28 +40,65 @@ impl InstanceUpdater { opctx: &OpContext, stats: &mut ActivationStats, ) -> Result<(), anyhow::Error> { - let log = &opctx.log; - - slog::debug!( - &log, - "looking for instances with destroyed active VMMs..." 
- ); - - let destroyed_active_vmms = self - .datastore - .find_instances_with_destroyed_active_vmms(opctx) - .await - .context("failed to find instances with destroyed active VMMs")?; + async fn find_instances( + what: &'static str, + log: &slog::Logger, + last_err: &mut Result<(), anyhow::Error>, + query: impl Future>, + ) -> Vec { + slog::debug!(&log, "looking for instances with {what}..."); + match query.await { + Ok(list) => { + slog::info!( + &log, + "listed instances with {what}"; + "count" => list.len(), + ); + list + } + Err(error) => { + slog::error!( + &log, + "failed to list instances with {what}"; + "error" => %error, + ); + *last_err = Err(error).with_context(|| { + format!("failed to find instances with {what}",) + }); + Vec::new() + } + } + } - slog::info!( - &log, - "listed instances with destroyed active VMMs"; - "count" => destroyed_active_vmms.len(), - ); + let mut last_err = Ok(()); + // NOTE(eliza): These don't, strictly speaking, need to be two separate + // queries, they probably could instead be `OR`ed together in SQL. I + // just thought it was nice to be able to record the number of instances + // found separately for each state. + let destroyed_active_vmms = find_instances( + "destroyed active VMMs", + &opctx.log, + &mut last_err, + self.datastore.find_instances_with_destroyed_active_vmms(opctx), + ) + .await; stats.destroyed_active_vmms = destroyed_active_vmms.len(); - for InstanceAndActiveVmm { instance, .. } in destroyed_active_vmms { + let terminated_active_migrations = find_instances( + "terminated active migrations", + &opctx.log, + &mut last_err, + self.datastore + .find_instances_with_terminated_active_migrations(opctx), + ) + .await; + stats.terminated_active_migrations = terminated_active_migrations.len(); + + for instance in destroyed_active_vmms + .iter() + .chain(terminated_active_migrations.iter()) + { let serialized_authn = authn::saga::Serialized::for_opctx(opctx); let (.., authz_instance) = LookupPath::new(&opctx, &self.datastore) .instance_id(instance.id()) @@ -77,17 +114,18 @@ impl InstanceUpdater { .send(saga) .await .context("SagaRequest receiver missing")?; - stats.sagas_started += 1; + stats.update_sagas_queued += 1; } - Ok(()) + last_err } } #[derive(Default)] struct ActivationStats { destroyed_active_vmms: usize, - sagas_started: usize, + terminated_active_migrations: usize, + update_sagas_queued: usize, } impl BackgroundTask for InstanceUpdater { @@ -103,7 +141,8 @@ impl BackgroundTask for InstanceUpdater { &opctx.log, "instance updater activation completed"; "destroyed_active_vmms" => stats.destroyed_active_vmms, - "sagas_started" => stats.sagas_started, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_queued" => stats.update_sagas_queued, ); None } @@ -113,14 +152,16 @@ impl BackgroundTask for InstanceUpdater { "instance updater activation failed!"; "error" => %error, "destroyed_active_vmms" => stats.destroyed_active_vmms, - "sagas_started" => stats.sagas_started, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_queued" => stats.update_sagas_queued, ); Some(error.to_string()) } }; json!({ "destroyed_active_vmms": stats.destroyed_active_vmms, - "sagas_started": stats.sagas_started, + "terminated_active_migrations": stats.terminated_active_migrations, + "update_sagas_queued": stats.update_sagas_queued, "error": error, }) } From d79050c928e3e3543fd14a93db4215b4308f6afe Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 1 Jul 2024 13:03:33 -0700 Subject: 
[PATCH 079/234] add instance-updater omdb stuff --- dev-tools/omdb/src/bin/omdb/nexus.rs | 56 +++++++++++++++++++++++++--- dev-tools/omdb/tests/successes.out | 7 +++- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index e19c998c3d3..d7351789769 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1243,11 +1243,6 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } else if name == "lookup_region_port" { match serde_json::from_value::(details.clone()) { - Err(error) => eprintln!( - "warning: failed to interpret task details: {:?}: {:?}", - error, details - ), - Ok(LookupRegionPortStatus { found_port_ok, errors }) => { println!(" total filled in ports: {}", found_port_ok.len()); for line in &found_port_ok { @@ -1259,6 +1254,57 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { println!(" > {line}"); } } + + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details, + ), + } + } else if name == "instance_updater" { + #[derive(Deserialize)] + struct UpdaterStatus { + /// number of instances found with destroyed active VMMs + destroyed_active_vmms: usize, + + /// number of instances found with terminated active migrations + terminated_active_migrations: usize, + + /// number of update sagas queued. + update_sagas_queued: usize, + + /// the last error that occurred during execution. + error: Option, + } + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(UpdaterStatus { + destroyed_active_vmms, + terminated_active_migrations, + update_sagas_queued, + error, + }) => { + if let Some(error) = error { + println!(" task did not complete successfully!"); + println!(" most recent error: {error}"); + } + + println!( + " total instances in need of updates: {}", + destroyed_active_vmms + terminated_active_migrations + ); + println!( + " instances with destroyed active VMMs: {}", + destroyed_active_vmms, + ); + println!( + " instances with terminated active migrations: {}", + terminated_active_migrations, + ); + println!(" update sagas queued: {update_sagas_queued}"); + } }; } else { println!( diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index eee272433df..7c8dadae4e0 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -489,9 +489,12 @@ task: "external_endpoints" task: "instance_updater" configured period: every 30s currently executing: no - last completed activation: , triggered by an explicit signal + last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms -warning: unknown background task: "instance_updater" (don't know how to interpret details: Object {"destroyed_active_vmms": Number(0), "error": Null, "sagas_started": Number(0)}) + total instances in need of updates: 0 + instances with destroyed active VMMs: 0 + instances with terminated active migrations: 0 + update sagas queued: 0 task: "instance_watcher" configured period: every s From 583b709f6951d13937b31a66b2991e2a85f38070 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 10:12:19 -0700 Subject: [PATCH 080/234] remove max gen from `virtual_provisioning_collection_delete_instance` --- .../virtual_provisioning_collection.rs | 34 +--------------- 
.../virtual_provisioning_collection_update.rs | 40 +++---------------- .../app/sagas/instance_update/destroyed.rs | 1 - 3 files changed, 7 insertions(+), 68 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs index 247eefd3d5b..7c3e1c4b8ff 100644 --- a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs +++ b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs @@ -280,10 +280,7 @@ impl DataStore { } /// Transitively removes the CPU and memory charges for an instance from the - /// instance's project, silo, and fleet, provided that the instance's state - /// generation is less than `max_instance_gen`. This allows a caller who is - /// about to apply generation G to an instance to avoid deleting resources - /// if its update was superseded. + /// instance's project, silo, and fleet. pub async fn virtual_provisioning_collection_delete_instance( &self, opctx: &OpContext, @@ -291,12 +288,10 @@ impl DataStore { project_id: Uuid, cpus_diff: i64, ram_diff: ByteCount, - max_instance_gen: i64, ) -> Result, Error> { let provisions = VirtualProvisioningCollectionUpdate::new_delete_instance( id, - max_instance_gen, cpus_diff, ram_diff, project_id, @@ -518,8 +513,6 @@ mod test { // Delete the instance - // Make this value outrageously high, so that as a "max" it is ignored. - let max_instance_gen: i64 = 1000; datastore .virtual_provisioning_collection_delete_instance( &opctx, @@ -527,7 +520,6 @@ mod test { project_id, cpus, ram, - max_instance_gen, ) .await .unwrap(); @@ -614,10 +606,6 @@ mod test { // Delete the instance - // If the "instance gen" is too low, the delete operation should be - // dropped. This mimics circumstances where an instance update arrives - // late to the query. - let max_instance_gen = 0; datastore .virtual_provisioning_collection_delete_instance( &opctx, @@ -625,25 +613,6 @@ mod test { project_id, cpus, ram, - max_instance_gen, - ) - .await - .unwrap(); - for id in ids { - verify_collection_usage(&datastore, &opctx, id, 12, 1 << 30, 0) - .await; - } - - // Make this value outrageously high, so that as a "max" it is ignored. - let max_instance_gen = 1000; - datastore - .virtual_provisioning_collection_delete_instance( - &opctx, - instance_id, - project_id, - cpus, - ram, - max_instance_gen, ) .await .unwrap(); @@ -664,7 +633,6 @@ mod test { project_id, cpus, ram, - max_instance_gen, ) .await .unwrap(); diff --git a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs index fd86912107f..3381ec3e8a9 100644 --- a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs +++ b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs @@ -81,17 +81,9 @@ pub fn from_diesel(e: DieselError) -> external::Error { #[derive(Clone)] enum UpdateKind { InsertStorage(VirtualProvisioningResource), - DeleteStorage { - id: uuid::Uuid, - disk_byte_diff: ByteCount, - }, + DeleteStorage { id: uuid::Uuid, disk_byte_diff: ByteCount }, InsertInstance(VirtualProvisioningResource), - DeleteInstance { - id: uuid::Uuid, - max_instance_gen: i64, - cpus_diff: i64, - ram_diff: ByteCount, - }, + DeleteInstance { id: uuid::Uuid, cpus_diff: i64, ram_diff: ByteCount }, } type SelectableSql = < @@ -246,15 +238,7 @@ WITH ),") .bind::(id) }, - UpdateKind::DeleteInstance { id, max_instance_gen, .. 
} => { - // The filter condition here ensures that the provisioning record is - // only deleted if the corresponding instance has a generation - // number less than the supplied `max_instance_gen`. This allows a - // caller that is about to apply an instance update that will stop - // the instance and that bears generation G to avoid deleting - // resources if the instance generation was already advanced to or - // past G. - // + UpdateKind::DeleteInstance { id, .. } => { // If the relevant instance ID is not in the database, then some // other operation must have ensured the instance was previously // stopped (because that's the only way it could have been deleted), @@ -279,14 +263,13 @@ WITH FROM instance WHERE - instance.id = ").param().sql(" AND instance.state_generation < ").param().sql(" + instance.id = ").param().sql(" LIMIT 1 ) AS update ),") .bind::(id) .bind::(id) - .bind::(max_instance_gen) }, }; @@ -477,7 +460,6 @@ FROM pub fn new_delete_instance( id: InstanceUuid, - max_instance_gen: i64, cpus_diff: i64, ram_diff: ByteCount, project_id: uuid::Uuid, @@ -485,7 +467,6 @@ FROM Self::apply_update( UpdateKind::DeleteInstance { id: id.into_untyped_uuid(), - max_instance_gen, cpus_diff, ram_diff, }, @@ -567,14 +548,9 @@ mod test { let project_id = Uuid::nil(); let cpus_diff = 4; let ram_diff = 2048.try_into().unwrap(); - let max_instance_gen = 0; let query = VirtualProvisioningCollectionUpdate::new_delete_instance( - id, - max_instance_gen, - cpus_diff, - ram_diff, - project_id, + id, cpus_diff, ram_diff, project_id, ); expectorate_query_contents( @@ -684,11 +660,7 @@ mod test { let ram_diff = 2048.try_into().unwrap(); let query = VirtualProvisioningCollectionUpdate::new_delete_instance( - id, - max_instance_gen, - cpus_diff, - ram_diff, - project_id, + id, cpus_diff, ram_diff, project_id, ); let _ = query .explain_async(&conn) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 75bd27793ee..206c1e40263 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -86,7 +86,6 @@ pub(super) async fn siu_destroyed_release_virtual_provisioning( instance.project_id, i64::from(instance.ncpus.0 .0), instance.memory, - i64::try_from(&max_gen).unwrap(), ) .await; match result { From 10594de6887ad7c89733b4bc02566db06335b096 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 10:43:33 -0700 Subject: [PATCH 081/234] clippiness --- nexus/db-queries/src/db/datastore/instance.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index eab6a93e388..20bfb0fc322 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -309,7 +309,8 @@ impl DataStore { use db::model::VmmState; use db::schema::instance::dsl; use db::schema::vmm::dsl as vmm_dsl; - Ok(vmm_dsl::vmm + + vmm_dsl::vmm .filter(vmm_dsl::state.eq(VmmState::Destroyed)) .inner_join( dsl::instance.on(dsl::active_propolis_id @@ -322,7 +323,7 @@ impl DataStore { &*self.pool_connection_authorized(opctx).await?, ) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?) 
+ .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } /// List all instances with active migrations that have terminated (either @@ -339,7 +340,7 @@ impl DataStore { use db::schema::instance::dsl; use db::schema::migration::dsl as migration_dsl; - Ok(dsl::instance + dsl::instance .filter(dsl::time_deleted.is_null()) .filter(dsl::migration_id.is_not_null()) .filter(dsl::updater_id.is_null()) @@ -358,7 +359,7 @@ impl DataStore { &*self.pool_connection_authorized(opctx).await?, ) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } /// Fetches information about an Instance that the caller has previously From 882ebb25e634672ecc3db2695475490c4c6c00ac Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 11:14:43 -0700 Subject: [PATCH 082/234] whoops the generation number gets used here --- nexus/src/app/sagas/instance_start.rs | 8 --- nexus/src/app/sagas/instance_update/start.rs | 64 +++++++++++++++----- 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index adde040a774..ecc75e886a5 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -363,9 +363,6 @@ async fn sis_account_virtual_resources_undo( ¶ms.serialized_authn, ); - let started_record = - sagactx.lookup::("started_record")?; - osagactx .datastore() .virtual_provisioning_collection_delete_instance( @@ -374,11 +371,6 @@ async fn sis_account_virtual_resources_undo( params.db_instance.project_id, i64::from(params.db_instance.ncpus.0 .0), nexus_db_model::ByteCount(*params.db_instance.memory), - // Use the next instance generation number as the generation limit - // to ensure the provisioning counters are released. (The "mark as - // starting" undo step will "publish" this new state generation when - // it moves the instance back to Stopped.) - (&started_record.runtime().gen.next()).into(), ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 476e6f0bde2..483fc457877 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -9,7 +9,8 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{ - ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, + ActionRegistry, NexusActionContext, NexusSaga, RealParams, + SagaDoActualInstanceUpdate, SagaInitError, UpdatesRequired, ACTION_GENERATE_ID, INSTANCE_LOCK, INSTANCE_LOCK_ID, }; use crate::app::sagas::declare_saga_actions; @@ -117,8 +118,13 @@ async fn siu_lock_instance_undo( ) -> Result<(), anyhow::Error> { let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; - super::unlock_instance_inner(serialized_authn, authz_instance, &sagactx) - .await?; + super::unlock_instance_inner( + serialized_authn, + authz_instance, + &sagactx, + None, + ) + .await?; Ok(()) } @@ -151,17 +157,47 @@ async fn siu_fetch_state_and_start_real_saga( .instance_fetch_all(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)?; - osagactx - .nexus() - .sagas - .saga_execute::(super::RealParams { - serialized_authn, - authz_instance, - state, - orig_lock, - }) - .await - .map_err(ActionError::action_failed)?; + + // Determine what updates are required based on the instance's current + // state snapshot. 
If there are updates to perform, execute the "real" + // update saga. Otherwise, if we don't need to do anything else, simply + // release the lock and finish this saga. + if let Some(update) = UpdatesRequired::for_snapshot(osagactx.log(), &state) + { + info!( + osagactx.log(), + "instance update: starting real update saga..."; + "instance_id" => %authz_instance.id(), + "new_runtime_state" => ?update.new_runtime, + "network_config_update" => ?update.network_config, + "destroy_vmm" => ?update.destroy_vmm, + ); + osagactx + .nexus() + .sagas + .saga_execute::(RealParams { + serialized_authn, + authz_instance, + state, + update, + orig_lock, + }) + .await + .map_err(ActionError::action_failed)?; + } else { + info!( + osagactx.log(), + "instance update: no updates required, releasing lock."; + "instance_id" => %authz_instance.id(), + ); + super::unlock_instance_inner( + &serialized_authn, + &authz_instance, + &sagactx, + None, + ) + .await?; + } Ok(()) } From e3191f177816d0b2b005e2455768391a4e63188c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 12:53:52 -0700 Subject: [PATCH 083/234] review feedback from @gjcolombo This changes how the instance update saga is constructed to better handle cases where the active VMM was destroyed *because* a migration out was successful, which should be treated as a "migration success" and should not release virtual provisioning resources or delete the instance's oximeter producer. Also, the instance record is now written back in the same query as releasing the lock, to avoid potential race conditions. --- nexus/db-queries/src/db/datastore/instance.rs | 5 + .../app/sagas/instance_update/destroyed.rs | 104 ---- nexus/src/app/sagas/instance_update/mod.rs | 448 ++++++++++-------- 3 files changed, 256 insertions(+), 301 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 20bfb0fc322..0cb2969fe88 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1157,11 +1157,15 @@ impl DataStore { /// - `authz_instance`: the instance to attempt to unlock /// - `updater_lock`: an [`UpdaterLock`] token representing the acquired /// lock to release. + /// - `new_runtime`: an optional [`InstanceRuntimeState`] to write + /// back to the database when the lock is released. If this is [`None`], + /// the instance's runtime state will not be modified. pub async fn instance_updater_unlock( &self, opctx: &OpContext, authz_instance: &authz::Instance, UpdaterLock { updater_id, locked_gen }: UpdaterLock, + new_runtime: Option<&InstanceRuntimeState>, ) -> Result { use db::schema::instance::dsl; @@ -1180,6 +1184,7 @@ impl DataStore { .set(( dsl::updater_gen.eq(Generation(locked_gen.0.next())), dsl::updater_id.eq(None::), + new_runtime.cloned(), )) .check_if_exists::(instance_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) 
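The guarded UPDATE above releases the updater lock and writes the new runtime state back in the same statement, so no other saga can acquire the lock and observe stale state in between. Below is a minimal, self-contained sketch of those semantics, modeled in memory rather than with the real Diesel query; the InstanceRow type and unlock method are invented for illustration and are not the Nexus datastore API.

// Minimal in-memory model of "unlock and write back runtime state in one
// guarded update". Illustrative only; not the real Nexus datastore code.

#[derive(Clone, Debug, PartialEq)]
struct RuntimeState {
    generation: u64,
    propolis_id: Option<u32>,
}

#[derive(Debug)]
struct InstanceRow {
    updater_id: Option<u32>,
    updater_gen: u64,
    runtime: RuntimeState,
}

impl InstanceRow {
    // Returns true only if the caller actually held the lock at `locked_gen`.
    fn unlock(
        &mut self,
        saga_id: u32,
        locked_gen: u64,
        new_runtime: Option<&RuntimeState>,
    ) -> bool {
        // Mirrors the WHERE clause: only the current holder, at the
        // generation it acquired, may release the lock.
        if self.updater_id != Some(saga_id) || self.updater_gen != locked_gen {
            return false;
        }
        self.updater_id = None;
        self.updater_gen += 1;
        if let Some(rt) = new_runtime {
            // The write-back is part of the same "statement" as the unlock,
            // so the next saga to lock the instance sees the new state.
            self.runtime = rt.clone();
        }
        true
    }
}

fn main() {
    let mut row = InstanceRow {
        updater_id: Some(1),
        updater_gen: 4,
        runtime: RuntimeState { generation: 7, propolis_id: Some(42) },
    };
    let new = RuntimeState { generation: 8, propolis_id: None };
    // A saga holding a stale lock generation cannot unlock or clobber state.
    assert!(!row.unlock(1, 3, Some(&new)));
    // The current holder releases the lock and writes back in one step.
    assert!(row.unlock(1, 4, Some(&new)));
    assert_eq!(row.runtime, new);
}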
diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 206c1e40263..1335267672b 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -6,10 +6,6 @@ use super::NexusActionContext; use super::RealParams; use super::DESTROYED_VMM_ID; use crate::app::sagas::ActionError; -use chrono::Utc; -use nexus_db_model::Generation; -use nexus_db_model::InstanceRuntimeState; -use nexus_db_model::InstanceState; use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -62,22 +58,6 @@ pub(super) async fn siu_destroyed_release_virtual_provisioning( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - // `virtual_provisioning_collection_delete_instace` will only delete virtual - // provisioning records that are *less than* the max generation parameter, - // not less than or equal to it --- the idea is that the generation number - // has already been advanced when we are deallocating the virtual - // provisioning records. This is kind of an artifact of sled-agent - // previously owning instance runtime state generations, since the - // sled-agent would have already advanced the instance's generation. - // - // However, now that the instance record is owned by Nexus, and we are - // updating the instance in response to a VMM state update from sled-agent, - // the instance record snapshot we are holding has not yet had its - // generation advanced, so we want to allow deleting virtual provisioning - // records that were created with the instance's current generation. The - // generation will be advanced at the end of this saga, once we have updated - // the actual instance record. - let max_gen = instance.runtime_state.gen.next(); let result = osagactx .datastore() .virtual_provisioning_collection_delete_instance( @@ -143,90 +123,6 @@ pub(super) async fn siu_destroyed_unassign_oximeter_producer( .map_err(ActionError::action_failed) } -pub(super) async fn siu_destroyed_update_network_config( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; - let nexus = osagactx.nexus(); - - info!( - osagactx.log(), - "instance update (VMM destroyed): deleting V2P mappings"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm_id, - "instance_update" => %"VMM destroyed", - ); - - nexus.background_tasks.activate(&nexus.background_tasks.task_v2p_manager); - - info!( - osagactx.log(), - "instance update (VMM destroyed): deleting NAT entries"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm_id, - "instance_update" => %"VMM destroyed", - ); - - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - nexus - .instance_delete_dpd_config(&opctx, &authz_instance) - .await - .map_err(ActionError::action_failed)?; - Ok(()) -} - -pub(super) async fn siu_destroyed_update_instance( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let RealParams { ref authz_instance, state, .. 
} = - sagactx.saga_params::()?; - - let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; - let instance = state.instance; - let osagactx = sagactx.user_data(); - let new_runtime = InstanceRuntimeState { - propolis_id: None, - nexus_state: InstanceState::NoVmm, - gen: Generation(instance.runtime_state.gen.0.next()), - time_updated: Utc::now(), - ..instance.runtime_state - }; - - info!( - osagactx.log(), - "instance update (VMM destroyed): updating runtime state"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm_id, - "new_runtime_state" => ?new_runtime, - "instance_update" => %"VMM destroyed", - ); - - // It's okay for this to fail, it just means that the active VMM ID has changed. - if let Err(e) = osagactx - .datastore() - .instance_update_runtime( - &InstanceUuid::from_untyped_uuid(authz_instance.id()), - &new_runtime, - ) - .await - { - warn!( - osagactx.log(), - "instance update (VMM destroyed): updating runtime state failed"; - "instance_id" => %authz_instance.id(), - "propolis_id" => %vmm_id, - "new_runtime_state" => ?new_runtime, - "instance_update" => %"VMM destroyed", - "error" => %e, - ); - } - Ok(()) -} - pub(super) async fn siu_destroyed_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 3087321cce6..8b7b4d761c6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -10,12 +10,13 @@ use crate::app::db::datastore::instance; use crate::app::db::datastore::InstanceSnapshot; use crate::app::db::lookup::LookupPath; use crate::app::db::model::Generation; +use crate::app::db::model::InstanceRuntimeState; +use crate::app::db::model::MigrationState; use crate::app::db::model::VmmState; -use crate::app::db::model::{Migration, MigrationState}; use crate::app::sagas::declare_saga_actions; use chrono::Utc; use nexus_db_queries::{authn, authz}; -use omicron_common::api::external::Error; +use nexus_types::identity::Resource; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -41,6 +42,154 @@ use destroyed::*; mod start; pub(crate) use self::start::{Params, SagaInstanceUpdate}; +#[derive(Debug, Deserialize, Serialize)] +struct UpdatesRequired { + /// The new runtime state that must be written back to the database. + new_runtime: InstanceRuntimeState, + + /// If `true`, this VMM must be destroyed. + destroy_vmm: Option, + + network_config: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +enum NetworkConfigUpdate { + Delete, + Update(PropolisUuid), +} + +/// An active VMM has been destroyed. +#[derive(Debug, Deserialize, Serialize)] +struct DestroyedVmm { + /// The UUID of the destroyed active VMM. + id: PropolisUuid, + /// If `true`, the virtual provisioning records for this instance should be + /// deallocated. + /// + /// This occurs when the instance's VMM is destroyed *without* migrating out. + /// If the instance's current active VMM has been destroyed because the + /// instance has successfully migrated out, the virtual provisioning records + /// are left in place, as the instance is still consuming those virtual + /// resources on its new sled. 
+ deprovision: bool, +} + +impl UpdatesRequired { + fn for_snapshot( + log: &slog::Logger, + snapshot: &InstanceSnapshot, + ) -> Option { + let mut new_runtime = snapshot.instance.runtime().clone(); + new_runtime.gen = Generation(new_runtime.gen.next()); + new_runtime.time_updated = Utc::now(); + let instance_id = snapshot.instance.id(); + + let mut update_required = false; + let mut network_config = None; + let mut deprovision = true; + + // Has the active VMM been destroyed? + let destroy_vmm = snapshot.active_vmm.as_ref().and_then(|active_vmm| { + if active_vmm.runtime.state == VmmState::Destroyed { + // Unlink the active VMM ID. If the active VMM was destroyed + // because a migration out completed, the next block, which + // handles migration updates, will set this to the new VMM's ID, + // instead. + new_runtime.propolis_id = None; + update_required = true; + network_config = Some(NetworkConfigUpdate::Delete); + Some(PropolisUuid::from_untyped_uuid(active_vmm.id)) + } else { + None + } + }); + + // Determine what to do with the migration. + if let Some(ref migration) = snapshot.migration { + // Determine how to update the instance record to reflect the current + // migration state. + let failed = migration.either_side_failed(); + // If the migration has failed, or if the target reports that the migration + // has completed, clear the instance record's migration IDs so that a new + // migration can begin. + if failed || migration.target_state == MigrationState::COMPLETED { + info!( + log, + "instance update (migration {}): clearing migration IDs", + if failed { "failed" } else { "target completed" }; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + new_runtime.migration_id = None; + new_runtime.dst_propolis_id = None; + update_required = true; + } + + // If the active VMM was destroyed, the network config must be + // deleted (which was determined above). Otherwise, if the + // migration failed but the active VMM was still there, we must + // still ensure the correct networking configuration + // exists for its current home. + // + // TODO(#3107) This is necessary even if the instance didn't move, + // because registering a migration target on a sled creates OPTE ports + // for its VNICs, and that creates new V2P mappings on that sled that + // place the relevant virtual IPs on the local sled. Once OPTE stops + // creating these mappings, this path only needs to be taken if an + // instance has changed sleds. + if failed && destroy_vmm.is_none() { + network_config = Some(NetworkConfigUpdate::Update( + PropolisUuid::from_untyped_uuid( + migration.source_propolis_id, + ), + )); + update_required = true; + } + + // If either side reports that the migration has completed, move the target + // Propolis ID to the active position. 
+ if !failed && migration.either_side_completed() { + info!( + log, + "instance update (migration completed): setting active VMM ID to target"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + + network_config = Some(NetworkConfigUpdate::Update( + PropolisUuid::from_untyped_uuid( + migration.target_propolis_id, + ), + )); + new_runtime.propolis_id = Some(migration.target_propolis_id); + let _prev_target_id = new_runtime.dst_propolis_id.take(); + debug_assert_eq!( + _prev_target_id, + Some(migration.target_propolis_id) + ); + // If the active VMM has also been destroyed, don't delete + // virtual provisioning records while cleaning it up. + deprovision = false; + update_required = true; + } + } + + if !update_required { + return None; + } + Some(Self { + new_runtime, + destroy_vmm: destroy_vmm.map(|id| DestroyedVmm { id, deprovision }), + network_config, + }) + } +} + /// Parameters to the "real" instance update saga. #[derive(Debug, Deserialize, Serialize)] struct RealParams { @@ -50,13 +199,15 @@ struct RealParams { state: InstanceSnapshot, + update: UpdatesRequired, + orig_lock: instance::UpdaterLock, } const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const INSTANCE_LOCK: &str = "updater_lock"; const DESTROYED_VMM_ID: &str = "destroyed_vmm_id"; -const MIGRATION: &str = "migration"; +const NETWORK_CONFIG_UPDATE: &str = "network_config_update"; // instance update saga: actions @@ -69,24 +220,14 @@ declare_saga_actions! { - siu_unbecome_updater } - UNLOCK_INSTANCE -> "unlocked" { - + siu_unlock_instance + // Update network configuration. + UPDATE_NETWORK_CONFIG -> "update_network_config" { + + siu_update_network_config } - // === migration update actions === - - // Update the instance record to reflect a migration event. If the - // migration has completed on the target VMM, or if the migration has - // failed, this will clear the migration IDs, allowing the instance to - // migrate again. If the migration has completed on either VMM, the target - // VMM becomes the active VMM. - MIGRATION_UPDATE_INSTANCE -> "migration_update_instance" { - + siu_migration_update_instance - } - - // Update network configuration to point to the new active VMM. - MIGRATION_UPDATE_NETWORK_CONFIG -> "migration_update_network_config" { - + siu_migration_update_network_config + // Release the lock and write back the new instance record. + UPDATE_AND_UNLOCK_INSTANCE -> "unlocked" { + + siu_update_and_unlock_instance } // === active VMM destroyed actions === @@ -108,19 +249,11 @@ declare_saga_actions! { + siu_destroyed_unassign_oximeter_producer } - // Notify the V2P manager background task to delete the destroyed VMM's V2P - // mappings, and delete the destroyed VMM's NAT entries. - DESTROYED_UPDATE_NETWORK_CONFIG -> "destroyed_update_network_config" { - + siu_destroyed_update_network_config - } - - DESTROYED_UPDATE_INSTANCE -> "destroyed_vmm_update_instance" { - + siu_destroyed_update_instance - } - DESTROYED_MARK_VMM_DELETED -> "destroyed_mark_vmm_deleted" { + siu_destroyed_mark_vmm_deleted } + + } // instance update saga: definition @@ -155,50 +288,38 @@ impl NexusSaga for SagaDoActualInstanceUpdate { )); builder.append(become_updater_action()); - // TODO(eliza): perhaps the start saga could determine whether it even - // needs to create a "real" saga based on whether either of the relevant - // state changes have occurred? 
- - // If the active VMM's state has changed to "destroyed", then clean up - // after it. - if let Some(ref active_vmm) = params.state.active_vmm { - // If the active VMM is `Destroyed`, schedule the active VMM - // destroyed subsaga. - if active_vmm.runtime.state == VmmState::Destroyed { - builder.append(const_node( - DESTROYED_VMM_ID, - &PropolisUuid::from_untyped_uuid(active_vmm.id), - )?); - builder.append(destroyed_release_sled_resources_action()); + // If the active VMM has been destroyed, clean up after it. + // TODO(eliza): if we also wished to delete destroyed target VMMs after + // a failed migration, we could move all the "VMM destroyed" actions into + // a subsaga that we can push twice... + if let Some(DestroyedVmm { ref id, deprovision }) = + params.update.destroy_vmm + { + builder.append(const_node(DESTROYED_VMM_ID, id)?); + builder.append(destroyed_release_sled_resources_action()); + // If the instance hasn't migrated out of the destroyed VMM, also release virtual + // provisioning records and unassign the Oximeter producer. + if deprovision { builder.append(destroyed_release_virtual_provisioning_action()); builder.append(destroyed_unassign_oximeter_producer_action()); - builder.append(destroyed_update_network_config_action()); - builder.append(destroyed_update_instance_action()); - builder.append(destroyed_mark_vmm_deleted_action()); } } - // Next, determine what to do with the migration. A migration update - // saga needs to be scheduled if (and only if) the instance's migration - // ID currently points to a migration. The `instance_fetch_all` query - // will only return a migration if it is the instance's currently active - // migration, so if we have one here, that means that there's a - // migration. - if let Some(ref migration) = params.state.migration { - // If either side of the migration reports a terminal state, update - // the instance to reflect that. - if migration.is_terminal() { - builder.append(const_node(MIGRATION, migration)?); - // TODO(eliza): perhaps we could determine the final state in - // `make_saga_dag` and push a constant node for it, and then - // only have one `update_instance` action that's run regardless - // of which path through the saga we build... - builder.append(migration_update_instance_action()); - builder.append(migration_update_network_config_action()); - } + // If a network config update is required, do that. + if let Some(ref update) = params.update.network_config { + builder.append(const_node(NETWORK_CONFIG_UPDATE, update)?); + builder.append(update_network_config_action()); } - builder.append(unlock_instance_action()); + builder.append(update_and_unlock_instance_action()); + + // Delete the active VMM only *after* the instance record is + // updated, to avoid creating a "dangling pointer" where the instance + // record's active VMM ID points to a VMM record that has now been + // deleted. + if params.update.destroy_vmm.is_some() { + builder.append(destroyed_mark_vmm_deleted_action()); + } Ok(builder.build()?) } } @@ -248,168 +369,101 @@ async fn siu_unbecome_updater( ) -> Result<(), anyhow::Error> { let RealParams { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; - unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await?; + unlock_instance_inner(serialized_authn, authz_instance, &sagactx, None) + .await?; Ok(()) } -async fn siu_migration_update_instance( - sagactx: NexusActionContext, -) -> Result { - let RealParams { ref authz_instance, ref state, .. 
} = - sagactx.saga_params()?; - let migration = sagactx.lookup::(MIGRATION)?; - - let osagactx = sagactx.user_data(); - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - - let mut new_runtime = state.instance.runtime().clone(); - new_runtime.gen = Generation(new_runtime.gen.next()); - new_runtime.time_updated = Utc::now(); - - // Determine how to update the instance record to reflect the current - // migration state. - let failed = migration.either_side_failed(); - // If the migration has failed, or if the target reports that the migration - // has completed, clear the instance record's migration IDs so that a new - // migration can begin. - if failed || migration.target_state == MigrationState::COMPLETED { - info!( - osagactx.log(), - "instance update (migration {}): clearing migration IDs", - if failed { "failed" } else { "target_completed" }; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - "instance_update" => %"migration", - ); - new_runtime.migration_id = None; - new_runtime.dst_propolis_id = None; - } - - // If either side reports that the migration has completed, move the target - // Propolis ID to the active position. - let new_propolis_id = if !failed && migration.either_side_completed() { - info!( - osagactx.log(), - "instance update (migration completed): setting active VMM ID to target"; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - "instance_update" => %"migration", - ); - new_runtime.propolis_id = Some(migration.target_propolis_id); - migration.target_propolis_id - } else { - migration.source_propolis_id - }; - - osagactx - .datastore() - .instance_update_runtime(&instance_id, &new_runtime) - .await - .map_err(ActionError::action_failed)?; - - Ok(PropolisUuid::from_untyped_uuid(new_propolis_id)) -} - -// TODO(eliza): the `update_network_config` actions for migration and -// destroyed-active-vmm *could* probably be combined...look into whether this is -// a good idea or not. -async fn siu_migration_update_network_config( +async fn siu_update_network_config( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params()?; - - let migration = sagactx.lookup::(MIGRATION)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let osagactx = sagactx.user_data(); + let nexus = osagactx.nexus(); let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - // Either the instance moved from one sled to another, or it attempted - // to migrate and failed. Ensure the correct networking configuration - // exists for its current home. - // - // TODO(#3107) This is necessary even if the instance didn't move, - // because registering a migration target on a sled creates OPTE ports - // for its VNICs, and that creates new V2P mappings on that sled that - // place the relevant virtual IPs on the local sled. Once OPTE stops - // creating these mappings, this path only needs to be taken if an - // instance has changed sleds. - - // Look up the ID of the sled that the instance now resides on, so that we - // can look up its address. 
- let active_propolis_id = - sagactx.lookup::("migration_update_instance")?; - let new_sled_id = match osagactx - .datastore() - .vmm_fetch(&opctx, authz_instance, &active_propolis_id) - .await - { - Ok(vmm) => vmm.sled_id, - - // A VMM in the active position should never be destroyed. If the - // sled sending this message is the owner of the instance's last - // active VMM and is destroying it, it should also have retired that - // VMM. - Err(Error::ObjectNotFound { .. }) => { - error!( + let update = + sagactx.lookup::(NETWORK_CONFIG_UPDATE)?; + + match update { + NetworkConfigUpdate::Delete => { + info!( osagactx.log(), - "instance's active vmm unexpectedly not found"; + "instance update: deleting network config"; "instance_id" => %instance_id, - "propolis_id" => %active_propolis_id, ); - - return Ok(()); + nexus + .instance_delete_dpd_config(&opctx, authz_instance) + .await + .map_err(ActionError::action_failed)?; } - Err(e) => return Err(ActionError::action_failed(e)), - }; + NetworkConfigUpdate::Update(active_propolis_id) => { + // Look up the ID of the sled that the instance now resides on, so that we + // can look up its address. + let new_sled_id = osagactx + .datastore() + .vmm_fetch(&opctx, authz_instance, &active_propolis_id) + .await + .map_err(ActionError::action_failed)? + .sled_id; + + info!( + osagactx.log(), + "instance update: ensuring updated instance network config"; + "instance_id" => %instance_id, + "active_propolis_id" => %active_propolis_id, + "new_sled_id" => %new_sled_id, + ); - info!( - osagactx.log(), - "instance update (migration): ensuring updated instance network config"; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - "active_propolis_id" => %active_propolis_id, - "sled_id" => %new_sled_id, - "migration_failed" => migration.either_side_failed(), - ); + let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) + .sled_id(new_sled_id) + .fetch() + .await + .map_err(ActionError::action_failed)?; + + nexus + .instance_ensure_dpd_config( + &opctx, + instance_id, + &sled.address(), + None, + ) + .await + .map_err(ActionError::action_failed)?; + } + } - let nexus = osagactx.nexus(); + // Make sure the V2P manager background task runs to ensure the V2P mappings + // for this instance are up to date. nexus.background_tasks.activate(&nexus.background_tasks.task_v2p_manager); - let (.., sled) = LookupPath::new(&opctx, osagactx.datastore()) - .sled_id(new_sled_id) - .fetch() - .await - .map_err(ActionError::action_failed)?; - - nexus - .instance_ensure_dpd_config(&opctx, instance_id, &sled.address(), None) - .await - .map_err(ActionError::action_failed)?; - Ok(()) } -async fn siu_unlock_instance( +async fn siu_update_and_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let RealParams { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - unlock_instance_inner(serialized_authn, authz_instance, &sagactx).await + let RealParams { + ref serialized_authn, ref authz_instance, ref update, .. 
+ } = sagactx.saga_params::()?; + unlock_instance_inner( + serialized_authn, + authz_instance, + &sagactx, + Some(&update.new_runtime), + ) + .await } async fn unlock_instance_inner( serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, sagactx: &NexusActionContext, + new_runtime: Option<&InstanceRuntimeState>, ) -> Result<(), ActionError> { let lock = sagactx.lookup::(INSTANCE_LOCK)?; let opctx = @@ -424,7 +478,7 @@ async fn unlock_instance_inner( let did_unlock = osagactx .datastore() - .instance_updater_unlock(&opctx, authz_instance, lock) + .instance_updater_unlock(&opctx, authz_instance, lock, new_runtime) .await .map_err(ActionError::action_failed)?; From 5a155f507de1717539cf501558f399598d2aeab5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 12:55:13 -0700 Subject: [PATCH 084/234] oops forgot to update tests --- nexus/db-queries/src/db/datastore/instance.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 0cb2969fe88..c88ba460b8f 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1367,7 +1367,7 @@ mod tests { // unlock the instance from saga 1 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1) + .instance_updater_unlock(&opctx, &authz_instance, lock1, None) .await .expect("instance must be unlocked by saga 1"); assert!(unlocked, "instance must actually be unlocked"); @@ -1380,7 +1380,7 @@ mod tests { // unlock the instance from saga 2 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, lock2) + .instance_updater_unlock(&opctx, &authz_instance, lock2, None) .await .expect("instance must be unlocked by saga 2"); assert!(unlocked, "instance must actually be unlocked"); @@ -1426,7 +1426,7 @@ mod tests { // now, unlock the instance. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1) + .instance_updater_unlock(&opctx, &authz_instance, lock1, None) .await ) .expect("instance should unlock"); @@ -1435,7 +1435,7 @@ mod tests { // unlocking it again should also succeed... let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock2) + .instance_updater_unlock(&opctx, &authz_instance, lock2, None) .await ) .expect("instance should unlock again"); @@ -1482,6 +1482,7 @@ mod tests { updater_id: saga2, locked_gen: lock1.locked_gen, }, + None, ) .await ) @@ -1500,7 +1501,7 @@ mod tests { // unlocking with the correct ID should succeed. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1) + .instance_updater_unlock(&opctx, &authz_instance, lock1, None) .await ) .expect("instance should unlock"); @@ -1517,6 +1518,7 @@ mod tests { // you from doing this exact thing. But, we should still // test that we handle it gracefully. 
UpdaterLock { updater_id: saga1, locked_gen: next_gen }, + None, ) .await ) From c526add9841f4b0fd1210eeab4930a9fcd51b94a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 13:16:26 -0700 Subject: [PATCH 085/234] whoops i forgot to update tests --- .../queries/virtual_provisioning_collection_update.rs | 1 - ..._provisioning_collection_update_delete_instance.sql | 10 ++++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs index 3381ec3e8a9..902d955a796 100644 --- a/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs +++ b/nexus/db-queries/src/db/queries/virtual_provisioning_collection_update.rs @@ -654,7 +654,6 @@ mod test { let conn = pool.pool().get().await.unwrap(); let id = InstanceUuid::nil(); - let max_instance_gen = 0; let project_id = Uuid::nil(); let cpus_diff = 16.try_into().unwrap(); let ram_diff = 2048.try_into().unwrap(); diff --git a/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql b/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql index 3c97b7efc71..69b2e017fd6 100644 --- a/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql +++ b/nexus/db-queries/tests/output/virtual_provisioning_collection_update_delete_instance.sql @@ -40,9 +40,7 @@ WITH 1 ) = 1 - AND EXISTS( - SELECT 1 FROM instance WHERE instance.id = $5 AND instance.state_generation < $6 LIMIT 1 - ) + AND EXISTS(SELECT 1 FROM instance WHERE instance.id = $5 LIMIT 1) AS update ), unused_cte_arm @@ -50,7 +48,7 @@ WITH DELETE FROM virtual_provisioning_resource WHERE - virtual_provisioning_resource.id = $7 AND (SELECT do_update.update FROM do_update LIMIT 1) + virtual_provisioning_resource.id = $6 AND (SELECT do_update.update FROM do_update LIMIT 1) RETURNING virtual_provisioning_resource.id, virtual_provisioning_resource.time_modified, @@ -65,8 +63,8 @@ WITH virtual_provisioning_collection SET time_modified = current_timestamp(), - cpus_provisioned = virtual_provisioning_collection.cpus_provisioned - $8, - ram_provisioned = virtual_provisioning_collection.ram_provisioned - $9 + cpus_provisioned = virtual_provisioning_collection.cpus_provisioned - $7, + ram_provisioned = virtual_provisioning_collection.ram_provisioned - $8 WHERE virtual_provisioning_collection.id = ANY (SELECT all_collections.id FROM all_collections) AND (SELECT do_update.update FROM do_update LIMIT 1) From d17575d154e07f89c17b22786338886b667e5e00 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 14:45:44 -0700 Subject: [PATCH 086/234] gotta actually put the instance in the NoVmm state --- nexus/src/app/sagas/instance_update/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 8b7b4d761c6..1857c3d5398 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -11,6 +11,7 @@ use crate::app::db::datastore::InstanceSnapshot; use crate::app::db::lookup::LookupPath; use crate::app::db::model::Generation; use crate::app::db::model::InstanceRuntimeState; +use crate::app::db::model::InstanceState; use crate::app::db::model::MigrationState; use crate::app::db::model::VmmState; use crate::app::sagas::declare_saga_actions; @@ -97,6 +98,7 @@ impl UpdatesRequired { // handles migration 
updates, will set this to the new VMM's ID, // instead. new_runtime.propolis_id = None; + new_runtime.nexus_state = InstanceState::NoVmm; update_required = true; network_config = Some(NetworkConfigUpdate::Delete); Some(PropolisUuid::from_untyped_uuid(active_vmm.id)) @@ -167,6 +169,10 @@ impl UpdatesRequired { ), )); new_runtime.propolis_id = Some(migration.target_propolis_id); + // Even if the active VMM was destroyed (and we set the + // instance's state to `NoVmm` above), it has successfully + // migrated, so leave it in the VMM state. + new_runtime.nexus_state = InstanceState::Vmm; let _prev_target_id = new_runtime.dst_propolis_id.take(); debug_assert_eq!( _prev_target_id, From a62c99dbed4ec25376bb4197c6b49037c7fe36ff Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 2 Jul 2024 15:20:03 -0700 Subject: [PATCH 087/234] oh i guess we can't assert that --- nexus/src/app/sagas/instance_update/mod.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 1857c3d5398..9fc2eec7ebf 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -173,11 +173,7 @@ impl UpdatesRequired { // instance's state to `NoVmm` above), it has successfully // migrated, so leave it in the VMM state. new_runtime.nexus_state = InstanceState::Vmm; - let _prev_target_id = new_runtime.dst_propolis_id.take(); - debug_assert_eq!( - _prev_target_id, - Some(migration.target_propolis_id) - ); + new_runtime.dst_propolis_id = None; // If the active VMM has also been destroyed, don't delete // virtual provisioning records while cleaning it up. deprovision = false; From 7b6f5a0764e245de72cb56343438f173e21fc117 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 3 Jul 2024 11:53:40 -0700 Subject: [PATCH 088/234] destroy both VMMs from the update saga As per [this comment][1] from @gjcolombo, we've determined that the update saga *should* be responsible for cleaning up sled resource allocations if handling an update where the active *or* target VMM has been `Destroyed`. This commit updates the saga to do that, turning the destroyed VMM cleanup actions into a subsaga so that we can, potentially, perform it for both VMMs. Releasing sled resources and marking VMMs as deleted is performed after the saga has written back the instance record with both VMMs unlinked, and dropped the lock. This ensures we don't leave "dangling pointers" in the instance record, by having a foreign key into the VMM table for a VMM that no longer exists. Since writing back the instance record releases the lock, this means that cleaning up resource allocations happens outside the lock. This way, sled resources are deallocated immediately, to prevent sled resource exhaustion from rapidly churning failed migrations, but we also release the update lock so that another update saga can start. Resources owned by the *instance* --- the oximeter producer and virtual provisioning resources --- are deallocated within the lock if and only if the instance has no active VMM. VMMs are cleaned up when they are destroyed regardless of whether the instance has an active VMM (e.g. destroyed targets are deleted when a migration fails, and destroyed source VMMs are deleted when a migration succeeds, but the instance's virtual resources are not deallocated in either case). 
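The policy described above can be summarized as a small decision function: VMM-owned resources (sled reservation, VMM record) are cleaned up whenever a VMM is destroyed, while instance-owned resources are deprovisioned only when the instance is left with no active VMM. The sketch below is illustrative only; VmmDisposition, CleanupPlan, and plan_cleanup are invented names and are not part of this change.

// Illustrative decision table for the cleanup policy described above.
#[derive(Clone, Copy, Debug, PartialEq)]
enum VmmDisposition {
    Running,
    Destroyed,
}

#[derive(Debug, PartialEq)]
struct CleanupPlan {
    // VMM-owned resources: sled reservation released, VMM row deleted.
    clean_up_active_vmm: bool,
    clean_up_target_vmm: bool,
    // Instance-owned resources: virtual provisioning records and the
    // oximeter producer, released only when no active VMM remains.
    deprovision_instance: bool,
}

fn plan_cleanup(
    active: Option<VmmDisposition>,
    target: Option<VmmDisposition>,
    migrated_out: bool,
) -> CleanupPlan {
    let active_destroyed = active == Some(VmmDisposition::Destroyed);
    let target_destroyed = target == Some(VmmDisposition::Destroyed);
    CleanupPlan {
        clean_up_active_vmm: active_destroyed,
        clean_up_target_vmm: target_destroyed,
        // A successful migration out means the instance still has an active
        // VMM (the former target), so its virtual resources stay allocated.
        deprovision_instance: active_destroyed && !migrated_out,
    }
}

fn main() {
    // Migration succeeded: the destroyed source VMM is cleaned up, but the
    // instance keeps its virtual provisioning on the new sled.
    let plan = plan_cleanup(
        Some(VmmDisposition::Destroyed),
        Some(VmmDisposition::Running),
        true,
    );
    assert!(plan.clean_up_active_vmm);
    assert!(!plan.clean_up_target_vmm);
    assert!(!plan.deprovision_instance);

    // Instance simply stopped: both the VMM and the instance's virtual
    // resources are released.
    let plan = plan_cleanup(Some(VmmDisposition::Destroyed), None, false);
    assert!(plan.clean_up_active_vmm && plan.deprovision_instance);
}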
[1]: https://github.com/oxidecomputer/omicron/pull/5749#discussion_r1663189548 --- .../app/sagas/instance_update/destroyed.rs | 160 ++++------ nexus/src/app/sagas/instance_update/mod.rs | 300 +++++++++++++----- nexus/src/app/sagas/instance_update/start.rs | 5 +- 3 files changed, 285 insertions(+), 180 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index 1335267672b..a43abad8eba 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -2,22 +2,80 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use super::NexusActionContext; -use super::RealParams; -use super::DESTROYED_VMM_ID; +use super::{ + declare_saga_actions, ActionRegistry, DagBuilder, NexusActionContext, + NexusSaga, SagaInitError, +}; use crate::app::sagas::ActionError; +use nexus_db_queries::authn; use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; +use serde::{Deserialize, Serialize}; -pub(super) async fn siu_destroyed_release_sled_resources( +// destroy VMM subsaga: input parameters + +#[derive(Debug, Deserialize, Serialize)] +pub(super) struct Params { + /// Authentication context to use to fetch the instance's current state from + /// the database. + pub(super) serialized_authn: authn::saga::Serialized, + + /// Instance UUID of the instance being updated. This is only just used + /// for logging, so we just use the instance ID here instead of serializing + /// a whole instance record. + pub(super) instance_id: InstanceUuid, + + /// UUID of the VMM to destroy. + pub(super) vmm_id: PropolisUuid, +} + +// destroy VMM subsaga: actions + +declare_saga_actions! { + destroy_vmm; + + // Deallocate physical sled resources reserved for the destroyed VMM, as it + // is no longer using them. + RELEASE_SLED_RESOURCES -> "release_sled_resources" { + + siu_destroyed_release_sled_resources + } + + // Mark the VMM record as deleted. + MARK_VMM_DELETED -> "mark_vmm_deleted" { + + siu_destroyed_mark_vmm_deleted + } +} + +// destroy VMM subsaga: definition + +#[derive(Debug)] +pub(super) struct SagaDestroyVmm; +impl NexusSaga for SagaDestroyVmm { + const NAME: &'static str = "destroy-vmm"; + type Params = Params; + + fn register_actions(registry: &mut ActionRegistry) { + destroy_vmm_register_actions(registry) + } + + fn make_saga_dag( + _: &Self::Params, + mut builder: DagBuilder, + ) -> Result { + builder.append(release_sled_resources_action()); + builder.append(mark_vmm_deleted_action()); + Ok(builder.build()?) + } +} + +async fn siu_destroyed_release_sled_resources( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; + let Params { ref serialized_authn, instance_id, vmm_id, .. 
} = + sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -25,7 +83,7 @@ pub(super) async fn siu_destroyed_release_sled_resources( info!( osagactx.log(), "instance update (active VMM destroyed): deallocating sled resource reservation"; - "instance_id" => %authz_instance.id(), + "instance_id" => %instance_id, "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); @@ -44,92 +102,12 @@ pub(super) async fn siu_destroyed_release_sled_resources( .map_err(ActionError::action_failed) } -pub(super) async fn siu_destroyed_release_virtual_provisioning( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, state, .. } = - sagactx.saga_params::()?; - - let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; - let instance = state.instance; - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); - - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - - let result = osagactx - .datastore() - .virtual_provisioning_collection_delete_instance( - &opctx, - instance_id, - instance.project_id, - i64::from(instance.ncpus.0 .0), - instance.memory, - ) - .await; - match result { - Ok(deleted) => { - info!( - osagactx.log(), - "instance update (VMM destroyed): deallocated virtual \ - provisioning resources"; - "instance_id" => %instance_id, - "propolis_id" => %vmm_id, - "records_deleted" => ?deleted, - "instance_update" => %"VMM destroyed", - ); - } - // Necessary for idempotency --- the virtual provisioning resources may - // have been deleted already, that's fine. - Err(Error::ObjectNotFound { .. }) => { - // TODO(eliza): it would be nice if we could distinguish - // between errors returned by - // `virtual_provisioning_collection_delete_instance` where - // the instance ID was not found, and errors where the - // generation number was too low... - info!( - osagactx.log(), - "instance update (VMM destroyed): virtual provisioning \ - record not found; perhaps it has already been deleted?"; - "instance_id" => %instance_id, - "propolis_id" => %vmm_id, - "instance_update" => %"VMM destroyed", - ); - } - Err(err) => return Err(ActionError::action_failed(err)), - }; - - Ok(()) -} - -pub(super) async fn siu_destroyed_unassign_oximeter_producer( - sagactx: NexusActionContext, -) -> Result<(), ActionError> { - let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - - crate::app::oximeter::unassign_producer( - osagactx.datastore(), - osagactx.log(), - &opctx, - &authz_instance.id(), - ) - .await - .map_err(ActionError::action_failed) -} - pub(super) async fn siu_destroyed_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let RealParams { ref authz_instance, ref serialized_authn, .. } = - sagactx.saga_params::()?; - let vmm_id = sagactx.lookup::(DESTROYED_VMM_ID)?; + let Params { ref serialized_authn, instance_id, vmm_id, .. 
} = + sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -137,7 +115,7 @@ pub(super) async fn siu_destroyed_mark_vmm_deleted( info!( osagactx.log(), "instance update (VMM destroyed): marking VMM record deleted"; - "instance_id" => %authz_instance.id(), + "instance_id" => %instance_id, "propolis_id" => %vmm_id, "instance_update" => %"VMM destroyed", ); diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 9fc2eec7ebf..bdb754f0eef 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -18,6 +18,7 @@ use crate::app::sagas::declare_saga_actions; use chrono::Utc; use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; +use omicron_common::api::external::Error; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -25,9 +26,6 @@ use serde::{Deserialize, Serialize}; use steno::{ActionError, DagBuilder, Node}; use uuid::Uuid; -mod destroyed; -use destroyed::*; - // The public interface to this saga is actually a smaller saga that starts the // "real" update saga, which inherits the lock from the start saga. This is // because the decision of which subsaga(s) to run depends on the state of the @@ -43,14 +41,16 @@ use destroyed::*; mod start; pub(crate) use self::start::{Params, SagaInstanceUpdate}; +mod destroyed; + #[derive(Debug, Deserialize, Serialize)] struct UpdatesRequired { /// The new runtime state that must be written back to the database. new_runtime: InstanceRuntimeState, - /// If `true`, this VMM must be destroyed. - destroy_vmm: Option, - + destroy_active_vmm: Option, + destroy_target_vmm: Option, + deprovision: bool, network_config: Option, } @@ -60,22 +60,6 @@ enum NetworkConfigUpdate { Update(PropolisUuid), } -/// An active VMM has been destroyed. -#[derive(Debug, Deserialize, Serialize)] -struct DestroyedVmm { - /// The UUID of the destroyed active VMM. - id: PropolisUuid, - /// If `true`, the virtual provisioning records for this instance should be - /// deallocated. - /// - /// This occurs when the instance's VMM is destroyed *without* migrating out. - /// If the instance's current active VMM has been destroyed because the - /// instance has successfully migrated out, the virtual provisioning records - /// are left in place, as the instance is still consuming those virtual - /// resources on its new sled. - deprovision: bool, -} - impl UpdatesRequired { fn for_snapshot( log: &slog::Logger, @@ -88,24 +72,53 @@ impl UpdatesRequired { let mut update_required = false; let mut network_config = None; - let mut deprovision = true; + let mut deprovision = false; // Has the active VMM been destroyed? - let destroy_vmm = snapshot.active_vmm.as_ref().and_then(|active_vmm| { - if active_vmm.runtime.state == VmmState::Destroyed { - // Unlink the active VMM ID. If the active VMM was destroyed - // because a migration out completed, the next block, which - // handles migration updates, will set this to the new VMM's ID, - // instead. - new_runtime.propolis_id = None; - new_runtime.nexus_state = InstanceState::NoVmm; - update_required = true; - network_config = Some(NetworkConfigUpdate::Delete); - Some(PropolisUuid::from_untyped_uuid(active_vmm.id)) - } else { - None - } - }); + let destroy_active_vmm = + snapshot.active_vmm.as_ref().and_then(|active_vmm| { + if active_vmm.runtime.state == VmmState::Destroyed { + // Unlink the active VMM ID. 
If the active VMM was destroyed + // because a migration out completed, the next block, which + // handles migration updates, will set this to the new VMM's ID, + // instead. + new_runtime.propolis_id = None; + new_runtime.nexus_state = InstanceState::NoVmm; + update_required = true; + // If and only if the active VMM was destroyed *and* we did + // not successfully migrate out of it, the instance's + // virtual provisioning records and oximeter producer must + // be cleaned up. + // + // If the active VMM was destroyed as a result of a + // successful migration out, the subsequent code for + // determining what to do with the migration will change + // this back. + deprovision = true; + // Similarly, if the active VMM was destroyed and the + // instance has not migrated out of it, we must delete the + // instance's network configuration. Again, if there was a + // migration out, the subsequent migration-handling code + // will change this to a network config update if the + // instance is now living somewhere else. + network_config = Some(NetworkConfigUpdate::Delete); + Some(PropolisUuid::from_untyped_uuid(active_vmm.id)) + } else { + None + } + }); + + let destroy_target_vmm = + snapshot.target_vmm.as_ref().and_then(|target_vmm| { + if target_vmm.runtime.state == VmmState::Destroyed { + // Unlink the target VMM ID. + new_runtime.dst_propolis_id = None; + update_required = true; + Some(PropolisUuid::from_untyped_uuid(target_vmm.id)) + } else { + None + } + }); // Determine what to do with the migration. if let Some(ref migration) = snapshot.migration { @@ -142,7 +155,7 @@ impl UpdatesRequired { // place the relevant virtual IPs on the local sled. Once OPTE stops // creating these mappings, this path only needs to be taken if an // instance has changed sleds. - if failed && destroy_vmm.is_none() { + if failed && destroy_active_vmm.is_none() { network_config = Some(NetworkConfigUpdate::Update( PropolisUuid::from_untyped_uuid( migration.source_propolis_id, @@ -184,9 +197,12 @@ impl UpdatesRequired { if !update_required { return None; } + Some(Self { new_runtime, - destroy_vmm: destroy_vmm.map(|id| DestroyedVmm { id, deprovision }), + destroy_active_vmm, + destroy_target_vmm, + deprovision, network_config, }) } @@ -208,7 +224,6 @@ struct RealParams { const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const INSTANCE_LOCK: &str = "updater_lock"; -const DESTROYED_VMM_ID: &str = "destroyed_vmm_id"; const NETWORK_CONFIG_UPDATE: &str = "network_config_update"; // instance update saga: actions @@ -227,35 +242,22 @@ declare_saga_actions! { + siu_update_network_config } - // Release the lock and write back the new instance record. - UPDATE_AND_UNLOCK_INSTANCE -> "unlocked" { - + siu_update_and_unlock_instance - } - - // === active VMM destroyed actions === - - // Deallocate physical sled resources reserved for the destroyed VMM, as it - // is no longer using them. - DESTROYED_RELEASE_SLED_RESOURCES -> "destroyed_vmm_release_sled_resources" { - + siu_destroyed_release_sled_resources - } - // Deallocate virtual provisioning resources reserved by the instance, as it // is no longer running. - DESTROYED_RELEASE_VIRTUAL_PROVISIONING -> "destroyed_vmm_release_virtual_provisioning" { - + siu_destroyed_release_virtual_provisioning + RELEASE_VIRTUAL_PROVISIONING -> "release_virtual_provisioning" { + + siu_release_virtual_provisioning } // Unassign the instance's Oximeter producer. 
- DESTROYED_UNASSIGN_OXIMETER_PRODUCER -> "destroyed_vmm_unassign_oximeter" { - + siu_destroyed_unassign_oximeter_producer + UNASSIGN_OXIMETER_PRODUCER -> "unassign_oximeter_producer" { + + siu_unassign_oximeter_producer } - DESTROYED_MARK_VMM_DELETED -> "destroyed_mark_vmm_deleted" { - + siu_destroyed_mark_vmm_deleted + // Release the lock and write back the new instance record. + UPDATE_AND_UNLOCK_INSTANCE -> "unlocked" { + + siu_update_and_unlock_instance } - } // instance update saga: definition @@ -272,17 +274,19 @@ impl NexusSaga for SagaDoActualInstanceUpdate { fn make_saga_dag( params: &Self::Params, mut builder: DagBuilder, - ) -> Result { + ) -> Result { + // Helper function for constructing a constant node. fn const_node( - name: &'static str, + name: impl AsRef, value: &impl serde::Serialize, - ) -> Result { + ) -> Result { let value = serde_json::to_value(value).map_err(|e| { - SagaInitError::SerializeError(name.to_string(), e) + SagaInitError::SerializeError(name.as_ref().to_string(), e) })?; Ok(Node::constant(name, value)) } + // Generate a new ID and attempt to inherit the lock from the start saga. builder.append(Node::action( INSTANCE_LOCK_ID, "GenerateInstanceLockId", @@ -290,38 +294,72 @@ impl NexusSaga for SagaDoActualInstanceUpdate { )); builder.append(become_updater_action()); - // If the active VMM has been destroyed, clean up after it. - // TODO(eliza): if we also wished to delete destroyed target VMMs after - // a failed migration, we could move all the "VMM destroyed" actions into - // a subsaga that we can push twice... - if let Some(DestroyedVmm { ref id, deprovision }) = - params.update.destroy_vmm - { - builder.append(const_node(DESTROYED_VMM_ID, id)?); - builder.append(destroyed_release_sled_resources_action()); - // If the instance hasn't migrated out of the destroyed VMM, also release virtual - // provisioning records and unassign the Oximeter producer. - if deprovision { - builder.append(destroyed_release_virtual_provisioning_action()); - builder.append(destroyed_unassign_oximeter_producer_action()); - } - } - // If a network config update is required, do that. if let Some(ref update) = params.update.network_config { builder.append(const_node(NETWORK_CONFIG_UPDATE, update)?); builder.append(update_network_config_action()); } + // If the instance now has no active VMM, release its virtual + // provisioning resources and unassign its Oximeter producer. + if params.update.deprovision { + builder.append(release_virtual_provisioning_action()); + builder.append(unassign_oximeter_producer_action()); + } + + // Once we've finished mutating everything owned by the instance, we can + // write ck the updated state and release the instance lock. builder.append(update_and_unlock_instance_action()); - // Delete the active VMM only *after* the instance record is - // updated, to avoid creating a "dangling pointer" where the instance - // record's active VMM ID points to a VMM record that has now been - // deleted. - if params.update.destroy_vmm.is_some() { - builder.append(destroyed_mark_vmm_deleted_action()); + // If either VMM linked to this instance has been destroyed, append + // subsagas to clean up the VMMs resources and mark them as deleted. + // + // Note that we must not mark the VMMs as deleted until *after* we have + // written back the updated instance record. 
Otherwise, if we mark a VMM + // as deleted while the instance record still references its ID, we will + // have created a state where the instance record contains a "dangling + // pointer" (database version) where the foreign key points to a record + // that no longer exists. Other consumers of the instance record may be + // unpleasantly surprised by this, so we avoid marking these rows as + // deleted until they've been unlinked from the instance by the + // `update_and_unlock_instance` action. + let mut append_destroyed_vmm_subsaga = + |vmm_id: PropolisUuid, which_vmm: &'static str| { + let params = destroyed::Params { + vmm_id, + instance_id: InstanceUuid::from_untyped_uuid( + params.authz_instance.id(), + ), + serialized_authn: params.serialized_authn.clone(), + }; + let name = format!("destroy_{which_vmm}_vmm"); + + let subsaga = destroyed::SagaDestroyVmm::make_saga_dag( + ¶ms, + DagBuilder::new(steno::SagaName::new(&name)), + )?; + + let params_name = format!("{name}_params"); + builder.append(const_node(¶ms_name, ¶ms)?); + + let output_name = format!("{which_vmm}_vmm_destroyed"); + builder.append(Node::subsaga( + output_name.as_str(), + subsaga, + ¶ms_name, + )); + + Ok::<(), SagaInitError>(()) + }; + + if let Some(vmm_id) = params.update.destroy_active_vmm { + append_destroyed_vmm_subsaga(vmm_id, "active")?; + } + + if let Some(vmm_id) = params.update.destroy_target_vmm { + append_destroyed_vmm_subsaga(vmm_id, "target")?; } + Ok(builder.build()?) } } @@ -446,6 +484,92 @@ async fn siu_update_network_config( Ok(()) } +pub(super) async fn siu_release_virtual_provisioning( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let RealParams { ref serialized_authn, ref authz_instance, state, .. } = + sagactx.saga_params::()?; + + let instance = state.instance; + let vmm_id = { + let id = instance + .runtime() + .propolis_id + .expect("a `release_virtual_provisioning` action should not have been pushed if there is no active VMM ID"); + PropolisUuid::from_untyped_uuid(id) + }; + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + let result = osagactx + .datastore() + .virtual_provisioning_collection_delete_instance( + &opctx, + instance_id, + instance.project_id, + i64::from(instance.ncpus.0 .0), + instance.memory, + ) + .await; + match result { + Ok(deleted) => { + info!( + osagactx.log(), + "instance update (VMM destroyed): deallocated virtual \ + provisioning resources"; + "instance_id" => %instance_id, + "propolis_id" => %vmm_id, + "records_deleted" => ?deleted, + "instance_update" => %"active VMM destroyed", + ); + } + // Necessary for idempotency --- the virtual provisioning resources may + // have been deleted already, that's fine. + Err(Error::ObjectNotFound { .. }) => { + info!( + osagactx.log(), + "instance update (VMM destroyed): virtual provisioning \ + record not found; perhaps it has already been deleted?"; + "instance_id" => %instance_id, + "propolis_id" => %vmm_id, + "instance_update" => %"active VMM destroyed", + ); + } + Err(err) => return Err(ActionError::action_failed(err)), + }; + + Ok(()) +} + +pub(super) async fn siu_unassign_oximeter_producer( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let RealParams { ref serialized_authn, ref authz_instance, .. 
} = + sagactx.saga_params::()?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + + info!( + osagactx.log(), + "instance update (VMM destroyed): unassigning oximeter producer"; + "instance_id" => %authz_instance.id(), + "instance_update" => %"active VMM destroyed", + ); + crate::app::oximeter::unassign_producer( + osagactx.datastore(), + osagactx.log(), + &opctx, + &authz_instance.id(), + ) + .await + .map_err(ActionError::action_failed) +} + async fn siu_update_and_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 483fc457877..6e501b5c550 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -62,6 +62,7 @@ impl NexusSaga for SagaInstanceUpdate { fn register_actions(registry: &mut ActionRegistry) { start_instance_update_register_actions(registry); super::SagaDoActualInstanceUpdate::register_actions(registry); + super::destroyed::SagaDestroyVmm::register_actions(registry); } fn make_saga_dag( @@ -170,7 +171,9 @@ async fn siu_fetch_state_and_start_real_saga( "instance_id" => %authz_instance.id(), "new_runtime_state" => ?update.new_runtime, "network_config_update" => ?update.network_config, - "destroy_vmm" => ?update.destroy_vmm, + "destroy_active_vmm" => ?update.destroy_active_vmm, + "destroy_target_vmm" => ?update.destroy_target_vmm, + "deprovision" => update.deprovision, ); osagactx .nexus() From 1b7bfd8037103bbcbed6ab285620e149bd12fd6b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 3 Jul 2024 12:55:32 -0700 Subject: [PATCH 089/234] clippy-clean helios sled-agent tests --- sled-agent/src/instance.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 62c7c6ca927..60164fd7485 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1756,8 +1756,7 @@ mod tests { let ticket = InstanceTicket::new_without_manager_for_test(id); - let initial_state = - fake_instance_initial_state(propolis_id, propolis_addr); + let initial_state = fake_instance_initial_state(propolis_addr); let (services, rx) = fake_instance_manager_services( log, @@ -1793,7 +1792,6 @@ mod tests { } fn fake_instance_initial_state( - propolis_id: PropolisUuid, propolis_addr: SocketAddr, ) -> InstanceInitialState { let hardware = InstanceHardware { @@ -2216,8 +2214,8 @@ mod tests { hardware, vmm_runtime, propolis_addr, - migration_id, - } = fake_instance_initial_state(propolis_id, propolis_addr); + migration_id: _, + } = fake_instance_initial_state(propolis_addr); let metadata = InstanceMetadata { silo_id: Uuid::new_v4(), From 0260f14e77404263ffdbf5add7777f86ac60836c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 3 Jul 2024 13:24:10 -0700 Subject: [PATCH 090/234] post-rebase update for #5985 --- nexus/src/app/background/init.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index f9917548bbd..fe08d02a308 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -639,16 +639,15 @@ impl BackgroundTasksInitializer { datastore.clone(), saga_request.clone(), ); - driver.register( - "instance_updater".to_string(), - "detects if instances require update sagas and schedules them" - .to_string(), - config.instance_updater.period_secs, - 
Box::new(updater), - opctx.child(BTreeMap::new()), - vec![], - task_instance_updater, - ); + driver.register( TaskDefinition { + name: "instance_updater", + description: "detects if instances require update sagas and schedules them", + period: config.instance_updater.period_secs, + task_impl: Box::new(updater), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_instance_updater, + }); } // Background task: service firewall rule propagation From 5f3718d245309eb38a032d78726aba72397aa3b0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 3 Jul 2024 14:23:47 -0700 Subject: [PATCH 091/234] post-rebase update for #5964 --- dev-tools/omdb/src/bin/omdb/nexus.rs | 34 +++- nexus/src/app/background/init.rs | 4 +- .../app/background/tasks/instance_updater.rs | 153 ++++++++++++++---- .../app/background/tasks/instance_watcher.rs | 67 ++++---- nexus/src/app/instance.rs | 36 ++--- nexus/src/app/instance_network.rs | 1 - 6 files changed, 200 insertions(+), 95 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index d7351789769..ec3e519cbca 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1269,8 +1269,17 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { /// number of instances found with terminated active migrations terminated_active_migrations: usize, - /// number of update sagas queued. - update_sagas_queued: usize, + /// number of update sagas started. + sagas_started: usize, + + /// number of sagas completed successfully + sagas_completed: usize, + + /// number of sagas which failed + sagas_failed: usize, + + /// number of sagas which could not be started + saga_start_failures: usize, /// the last error that occurred during execution. 
error: Option, @@ -1283,7 +1292,10 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { Ok(UpdaterStatus { destroyed_active_vmms, terminated_active_migrations, - update_sagas_queued, + sagas_started, + sagas_completed, + sagas_failed, + saga_start_failures, error, }) => { if let Some(error) = error { @@ -1303,7 +1315,21 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { " instances with terminated active migrations: {}", terminated_active_migrations, ); - println!(" update sagas queued: {update_sagas_queued}"); + println!(" update sagas started: {sagas_started}"); + println!( + " update sagas completed successfully: {}", + sagas_completed, + ); + + let total_failed = sagas_failed + saga_start_failures; + if total_failed > 0 { + println!(" unsuccessful update sagas: {total_failed}"); + println!( + " sagas which could not be started: {}", + saga_start_failures + ); + println!(" sagas failed: {sagas_failed}"); + } } }; } else { diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index fe08d02a308..385d95c317c 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -617,9 +617,9 @@ impl BackgroundTasksInitializer { { let watcher = instance_watcher::InstanceWatcher::new( datastore.clone(), + sagas.clone(), producer_registry, instance_watcher::WatcherIdentity { nexus_id, rack_id }, - saga_request.clone(), ); driver.register(TaskDefinition { name: "instance_watcher", @@ -637,7 +637,7 @@ impl BackgroundTasksInitializer { { let updater = instance_updater::InstanceUpdater::new( datastore.clone(), - saga_request.clone(), + sagas.clone(), ); driver.register( TaskDefinition { name: "instance_updater", diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index cced8e82036..4aa31c07023 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -5,8 +5,9 @@ //! Background task for detecting instances in need of update sagas. use crate::app::background::BackgroundTask; +use crate::app::saga::StartSaga; use crate::app::sagas::instance_update; -use crate::app::sagas::SagaRequest; +use crate::app::sagas::NexusSaga; use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; @@ -20,19 +21,16 @@ use omicron_common::api::external::ListResultVec; use serde_json::json; use std::future::Future; use std::sync::Arc; -use tokio::sync::mpsc::Sender; +use tokio::task::JoinSet; pub struct InstanceUpdater { datastore: Arc, - saga_req: Sender, + sagas: Arc, } impl InstanceUpdater { - pub fn new( - datastore: Arc, - saga_req: Sender, - ) -> Self { - InstanceUpdater { datastore, saga_req } + pub fn new(datastore: Arc, sagas: Arc) -> Self { + InstanceUpdater { datastore, sagas } } async fn activate2( @@ -71,6 +69,7 @@ impl InstanceUpdater { } let mut last_err = Ok(()); + let mut sagas = JoinSet::new(); // NOTE(eliza): These don't, strictly speaking, need to be two separate // queries, they probably could instead be `OR`ed together in SQL. 
I @@ -84,6 +83,14 @@ impl InstanceUpdater { ) .await; stats.destroyed_active_vmms = destroyed_active_vmms.len(); + self.start_sagas( + &opctx, + stats, + &mut last_err, + &mut sagas, + destroyed_active_vmms, + ) + .await; let terminated_active_migrations = find_instances( "terminated active migrations", @@ -94,38 +101,101 @@ impl InstanceUpdater { ) .await; stats.terminated_active_migrations = terminated_active_migrations.len(); + self.start_sagas( + &opctx, + stats, + &mut last_err, + &mut sagas, + terminated_active_migrations, + ) + .await; - for instance in destroyed_active_vmms - .iter() - .chain(terminated_active_migrations.iter()) - { - let serialized_authn = authn::saga::Serialized::for_opctx(opctx); - let (.., authz_instance) = LookupPath::new(&opctx, &self.datastore) - .instance_id(instance.id()) - .lookup_for(authz::Action::Modify) - .await?; - let saga = SagaRequest::InstanceUpdate { - params: instance_update::Params { - serialized_authn, - authz_instance, - }, - }; - self.saga_req - .send(saga) - .await - .context("SagaRequest receiver missing")?; - stats.update_sagas_queued += 1; + // Now, wait for the sagas to complete. + while let Some(saga_result) = sagas.join_next().await { + match saga_result { + Err(err) => { + debug_assert!( + false, + "since nexus is compiled with `panic=\"abort\"`, and \ + we never cancel the tasks on the `JoinSet`, a \ + `JoinError` should never be observed!", + ); + stats.sagas_failed += 1; + last_err = Err(err.into()); + } + Ok(Err(err)) => { + warn!(opctx.log, "update saga failed!"; "error" => %err); + stats.sagas_failed += 1; + last_err = Err(err.into()); + } + Ok(Ok(())) => stats.sagas_completed += 1, + } } last_err } + + async fn start_sagas( + &self, + opctx: &OpContext, + stats: &mut ActivationStats, + last_err: &mut Result<(), anyhow::Error>, + sagas: &mut JoinSet>, + instances: impl IntoIterator, + ) { + let serialized_authn = authn::saga::Serialized::for_opctx(opctx); + for instance in instances { + let instance_id = instance.id(); + let saga = async { + let (.., authz_instance) = + LookupPath::new(&opctx, &self.datastore) + .instance_id(instance_id) + .lookup_for(authz::Action::Modify) + .await?; + instance_update::SagaInstanceUpdate::prepare( + &instance_update::Params { + serialized_authn: serialized_authn.clone(), + authz_instance, + }, + ) + .with_context(|| { + format!("failed to prepare instance-update saga for {instance_id}") + }) + } + .await; + match saga { + Ok(saga) => { + let start_saga = self.sagas.clone(); + sagas.spawn(async move { + start_saga.saga_start(saga).await.with_context(|| { + format!("update saga for {instance_id} failed") + }) + }); + stats.sagas_started += 1; + } + Err(err) => { + warn!( + opctx.log, + "failed to start instance-update saga!"; + "instance_id" => %instance_id, + "error" => %err, + ); + stats.saga_start_failures += 1; + *last_err = Err(err); + } + } + } + } } #[derive(Default)] struct ActivationStats { destroyed_active_vmms: usize, terminated_active_migrations: usize, - update_sagas_queued: usize, + sagas_started: usize, + sagas_completed: usize, + sagas_failed: usize, + saga_start_failures: usize, } impl BackgroundTask for InstanceUpdater { @@ -142,7 +212,20 @@ impl BackgroundTask for InstanceUpdater { "instance updater activation completed"; "destroyed_active_vmms" => stats.destroyed_active_vmms, "terminated_active_migrations" => stats.terminated_active_migrations, - "update_sagas_queued" => stats.update_sagas_queued, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => 
stats.sagas_completed, + ); + debug_assert_eq!( + stats.sagas_failed, + 0, + "if the task completed successfully, then no sagas \ + should have failed", + ); + debug_assert_eq!( + stats.saga_start_failures, + 0, + "if the task completed successfully, all sagas \ + should have started successfully" ); None } @@ -153,7 +236,10 @@ impl BackgroundTask for InstanceUpdater { "error" => %error, "destroyed_active_vmms" => stats.destroyed_active_vmms, "terminated_active_migrations" => stats.terminated_active_migrations, - "update_sagas_queued" => stats.update_sagas_queued, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + "update_sagas_failed" => stats.sagas_failed, + "update_saga_start_failures" => stats.saga_start_failures, ); Some(error.to_string()) } @@ -161,7 +247,10 @@ impl BackgroundTask for InstanceUpdater { json!({ "destroyed_active_vmms": stats.destroyed_active_vmms, "terminated_active_migrations": stats.terminated_active_migrations, - "update_sagas_queued": stats.update_sagas_queued, + "sagas_started": stats.sagas_started, + "sagas_completed": stats.sagas_completed, + "sagas_failed": stats.sagas_failed, + "saga_start_failures": stats.saga_start_failures, "error": error, }) } diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 03b34c44609..7f4cb2c1aea 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -4,9 +4,8 @@ //! Background task for pulling instance state from sled-agents. -use crate::app::background::Activator; use crate::app::background::BackgroundTask; -use crate::app::sagas; +use crate::app::saga::StartSaga; use futures::{future::BoxFuture, FutureExt}; use http::StatusCode; use nexus_db_model::Instance; @@ -31,7 +30,6 @@ use std::future::Future; use std::num::NonZeroU32; use std::sync::Arc; use std::sync::Mutex; -use tokio::sync::mpsc::Sender; use uuid::Uuid; oximeter::use_timeseries!("vm-health-check.toml"); @@ -40,9 +38,9 @@ use virtual_machine::VirtualMachine; /// Background task that periodically checks instance states. 
pub(crate) struct InstanceWatcher { datastore: Arc, + sagas: Arc, metrics: Arc>, id: WatcherIdentity, - v2p_manager: Activator, } const MAX_SLED_AGENTS: NonZeroU32 = unsafe { @@ -53,15 +51,15 @@ const MAX_SLED_AGENTS: NonZeroU32 = unsafe { impl InstanceWatcher { pub(crate) fn new( datastore: Arc, + sagas: Arc, producer_registry: &ProducerRegistry, id: WatcherIdentity, - v2p_manager: Activator, ) -> Self { let metrics = Arc::new(Mutex::new(metrics::Metrics::default())); producer_registry .register_producer(metrics::Producer(metrics.clone())) .unwrap(); - Self { datastore, resolver, metrics, id, v2p_manager } + Self { datastore, sagas, metrics, id } } fn check_instance( @@ -71,6 +69,7 @@ impl InstanceWatcher { target: VirtualMachine, ) -> impl Future + Send + 'static { let datastore = self.datastore.clone(); + let sagas = self.sagas.clone(); let opctx = opctx.child( std::iter::once(( @@ -80,7 +79,6 @@ impl InstanceWatcher { .collect(), ); let client = client.clone(); - let v2p_manager = self.v2p_manager.clone(); async move { slog::trace!(opctx.log, "checking on instance..."); @@ -161,35 +159,34 @@ impl InstanceWatcher { "updating instance state"; "state" => ?new_runtime_state.vmm_state.state, ); - check.result = - crate::app::instance::notify_instance_updated_background( - &datastore, - &opctx, - &saga_req, - InstanceUuid::from_untyped_uuid(target.instance_id), - new_runtime_state, - ) - .await - .map_err(|e| { - slog::warn!( - opctx.log, - "error updating instance"; - "error" => ?e, - ); - match e { - Error::ObjectNotFound { .. } => { - Incomplete::InstanceNotFound - } - _ => Incomplete::UpdateFailed, + check.result = crate::app::instance::notify_instance_updated( + &datastore, + sagas.as_ref(), + &opctx, + InstanceUuid::from_untyped_uuid(target.instance_id), + new_runtime_state, + ) + .await + .map_err(|e| { + slog::warn!( + opctx.log, + "error updating instance"; + "error" => ?e, + ); + match e { + Error::ObjectNotFound { .. } => { + Incomplete::InstanceNotFound } - }) - .map(|updated| { - slog::debug!( - opctx.log, "update successful"; - "vmm_updated" => ?updated, - ); - check.update_saga_queued = updated; - }); + _ => Incomplete::UpdateFailed, + } + }) + .map(|updated| { + slog::debug!( + opctx.log, "update successful"; + "vmm_updated" => ?updated, + ); + check.update_saga_queued = updated; + }); check } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index f989454e28c..7369403e08d 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -12,7 +12,9 @@ use super::MAX_NICS_PER_INSTANCE; use super::MAX_SSH_KEYS_PER_INSTANCE; use super::MAX_VCPU_PER_INSTANCE; use super::MIN_MEMORY_BYTES_PER_INSTANCE; +use crate::app::saga::StartSaga; use crate::app::sagas; +use crate::app::sagas::NexusSaga; use crate::cidata::InstanceCiData; use crate::external_api::params; use cancel_safe_futures::prelude::*; @@ -1862,14 +1864,17 @@ impl super::Nexus { } } -/// `Nexus::notify_instance_updated` (~~Taylor~~ background task's version) -pub(crate) async fn notify_instance_updated_background( +/// Invoked by a sled agent to publish an updated runtime state for an +/// Instance. 
+pub(crate) async fn notify_instance_updated( datastore: &DataStore, + sagas: &dyn StartSaga, opctx: &OpContext, - saga_request: &tokio::sync::mpsc::Sender, instance_id: InstanceUuid, new_runtime_state: nexus::SledInstanceState, ) -> Result { + use sagas::instance_update; + let migrations = new_runtime_state.migrations(); let propolis_id = new_runtime_state.propolis_id; info!(opctx.log, "received new VMM runtime state from sled agent"; @@ -1894,24 +1899,13 @@ pub(crate) async fn notify_instance_updated_background( .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) .await?; - let params = sagas::instance_update::Params { - serialized_authn: authn::saga::Serialized::for_opctx(opctx), - authz_instance, - }; - info!(opctx.log, "queueing update saga for {instance_id}"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?migrations, - ); - saga_request - .send(sagas::SagaRequest::InstanceUpdate { params }) - .await - .map_err(|_| { - Error::internal_error( - "background saga executor is gone! this is not supposed to happen" - ) - })?; + let saga = instance_update::SagaInstanceUpdate::prepare( + &instance_update::Params { + serialized_authn: authn::saga::Serialized::for_opctx(opctx), + authz_instance, + }, + )?; + sagas.saga_start(saga).await?; } Ok(updated) diff --git a/nexus/src/app/instance_network.rs b/nexus/src/app/instance_network.rs index 27edcacabd7..8cd0a34fbf2 100644 --- a/nexus/src/app/instance_network.rs +++ b/nexus/src/app/instance_network.rs @@ -4,7 +4,6 @@ //! Routines that manage instance-related networking state. -use crate::app::background; use crate::app::switch_port; use ipnetwork::IpNetwork; use nexus_db_model::ExternalIp; From 5e566da4d75fec0e9320ee3c9ff7ec3f0140d2a5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 3 Jul 2024 14:59:12 -0700 Subject: [PATCH 092/234] don't wait for saga completion in API endpoint this is still gross... 
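For context on the shape of this change: all the sled-agent caller of this
endpoint needs is for the VMM and migration rows to have landed in CRDB, so
the handler no longer blocks on the instance-update saga that reconciles the
instance record. A minimal sketch of the pattern being adopted (assumptions:
the saga type parameter, which does not survive this rendering of the diff,
is the start saga `instance_update::SagaInstanceUpdate`, and a `log` handle
cloned from the opctx is available; the diff below simply drops the saga
result, and a later patch in this series adds error logging plus a kick of
the instance-updater background task so dropped updates are retried):

    let sagas = self.sagas.clone();
    tokio::spawn(async move {
        // Fire and forget: the HTTP response does not wait for the
        // instance record to be reconciled by the update saga.
        if let Err(err) = sagas
            .saga_execute::<sagas::instance_update::SagaInstanceUpdate>(
                saga_params,
            )
            .await
        {
            warn!(&log, "detached instance-update saga failed"; "error" => %err);
        }
    });
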
--- nexus/src/app/instance.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 7369403e08d..dd3dad21b95 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1420,11 +1420,14 @@ impl super::Nexus { serialized_authn: authn::saga::Serialized::for_opctx(opctx), authz_instance, }; - self.sagas - .saga_execute::( - saga_params, - ) - .await?; + let sagas = self.sagas.clone(); + tokio::spawn(async move { + sagas + .saga_execute::( + saga_params, + ) + .await + }); } Ok(()) } From 130fca2423291cf265b29101f297039bd14f83c9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 4 Jul 2024 09:02:35 -0700 Subject: [PATCH 093/234] use `wait_for_condition` in instance start tests --- nexus/src/app/sagas/instance_start.rs | 55 ++++++++++++++++++--------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index ecc75e886a5..e5cf0433810 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -685,6 +685,8 @@ mod test { use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, }; + use omicron_test_utils::dev::poll; + use std::time::Duration; use uuid::Uuid; use super::*; @@ -802,28 +804,45 @@ mod test { }) }, || { - Box::pin({ - async { - let new_db_instance = test_helpers::instance_fetch( - cptestctx, - instance_id, + Box::pin(async { + let new_db_instance = + // Wait until the instance has advanced to the `NoVmm` + // state. This may not happen immediately, as the + // `Nexus::cpapi_instances_put` API endpoint simply + // writes the new VMM state to the database and *starts* + // an `instance-update` saga, and the instance record + // isn't updated until that saga completes. 
+ poll::wait_for_condition( + || async { + let new_db_instance = test_helpers::instance_fetch( + cptestctx, + instance_id, + ) + .await.instance().clone(); + if new_db_instance.runtime().nexus_state == nexus_db_model::InstanceState::Vmm { + Err(poll::CondCheckError::::NotYet) + } else { + Ok(new_db_instance) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(300), ) - .await.instance().clone(); + .await.expect("instance did not transition to NoVmm state after 300 seconds"); - info!(log, - "fetched instance runtime state after saga execution"; - "instance_id" => %instance.identity.id, - "instance_runtime" => ?new_db_instance.runtime()); + info!(log, + "fetched instance runtime state after saga execution"; + "instance_id" => %instance.identity.id, + "instance_runtime" => ?new_db_instance.runtime()); - assert!(new_db_instance.runtime().propolis_id.is_none()); - assert_eq!( - new_db_instance.runtime().nexus_state, - nexus_db_model::InstanceState::NoVmm - ); + assert!(new_db_instance.runtime().propolis_id.is_none()); + assert_eq!( + new_db_instance.runtime().nexus_state, + nexus_db_model::InstanceState::NoVmm + ); - assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await); - assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); - } + assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await); + assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); }) }, log, From 92b538895ec88fa90b008c896a46665605b9d4f7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 4 Jul 2024 09:35:51 -0700 Subject: [PATCH 094/234] also use it in `instance_migrate` tests --- nexus/src/app/sagas/instance_migrate.rs | 26 ++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 7c51730b252..1ca2379cb81 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -847,9 +847,29 @@ mod tests { test_helpers::instance_simulate(cptestctx, &instance_id) .await; - let new_state = - test_helpers::instance_fetch(cptestctx, instance_id) - .await; + // Wait until the instance has advanced to the `NoVmm` + // state. This may not happen immediately, as the + // `Nexus::cpapi_instances_put` API endpoint simply + // writes the new VMM state to the database and *starts* + // an `instance-update` saga, and the instance record + // isn't updated until that saga completes. 
+ let new_state = poll::wait_for_condition( + || async { + let new_state = test_helpers::instance_fetch( + cptestctx, + instance_id, + ) + .await.instance().clone(); + if new_state.runtime().nexus_state == nexus_db_model::InstanceState::Vmm { + Err(poll::CondCheckError::::NotYet) + } else { + Ok(new_state) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(300), + ) + .await.expect("instance did not transition to NoVmm state after 300 seconds"); let new_instance = new_state.instance(); let new_vmm = new_state.vmm().as_ref(); From f805c16e6a6cda39f5b7c1f41ac4f7c688e17869 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 4 Jul 2024 09:42:46 -0700 Subject: [PATCH 095/234] refactor `notify_instance_updated` a bit --- .../app/background/tasks/instance_watcher.rs | 43 +++++----- nexus/src/app/instance.rs | 81 +++++++++---------- nexus/src/app/sagas/instance_migrate.rs | 9 ++- nexus/src/internal_api/http_entrypoints.rs | 2 +- 4 files changed, 65 insertions(+), 70 deletions(-) diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index 7f4cb2c1aea..f63c21105e2 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -154,39 +154,38 @@ impl InstanceWatcher { let new_runtime_state: SledInstanceState = state.into(); check.outcome = CheckOutcome::Success(new_runtime_state.vmm_state.state.into()); - slog::debug!( + debug!( opctx.log, "updating instance state"; "state" => ?new_runtime_state.vmm_state.state, ); - check.result = crate::app::instance::notify_instance_updated( + match crate::app::instance::notify_instance_updated( &datastore, - sagas.as_ref(), &opctx, InstanceUuid::from_untyped_uuid(target.instance_id), - new_runtime_state, + &new_runtime_state, ) .await - .map_err(|e| { - slog::warn!( - opctx.log, - "error updating instance"; - "error" => ?e, - ); - match e { - Error::ObjectNotFound { .. } => { - Incomplete::InstanceNotFound + { + Err(e) => { + warn!(opctx.log, "error updating instance"; "error" => %e); + check.result = match e { + Error::ObjectNotFound { .. 
} => { + Err(Incomplete::InstanceNotFound) + } + _ => Err(Incomplete::UpdateFailed), + }; + } + Ok(Some(saga)) => { + check.update_saga_queued = true; + if let Err(e) = sagas.saga_start(saga).await { + warn!(opctx.log, "update saga failed"; "error" => ?e); + check.result = Err(Incomplete::UpdateFailed); } - _ => Incomplete::UpdateFailed, } - }) - .map(|updated| { - slog::debug!( - opctx.log, "update successful"; - "vmm_updated" => ?updated, - ); - check.update_saga_queued = updated; - }); + Ok(None) => {} + }; + check } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index dd3dad21b95..f2a159d2037 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1382,53 +1382,44 @@ impl super::Nexus { pub(crate) async fn notify_instance_updated( self: &Arc, opctx: &OpContext, - instance_id: &InstanceUuid, + instance_id: InstanceUuid, new_runtime_state: &nexus::SledInstanceState, ) -> Result<(), Error> { - let migrations = new_runtime_state.migrations(); - let propolis_id = new_runtime_state.propolis_id; - info!(opctx.log, "received new VMM runtime state from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?migrations, - ); + let saga = notify_instance_updated( + &self.db_datastore, + opctx, + instance_id, + new_runtime_state, + ) + .await?; - let (vmm_updated, migration_updated) = self - .db_datastore - .vmm_and_migration_update_runtime( - propolis_id, - // TODO(eliza): probably should take this by value... - &new_runtime_state.vmm_state.clone().into(), - migrations, - ) - .await?; - let updated = vmm_updated || migration_updated.unwrap_or(false); - if updated { + // We don't need to wait for the instance update saga to run to + // completion to return OK to the sled-agent --- all it needs to care + // about is that the VMM/migration state in the database was updated. + // Even if we fail to successfully start an update saga, the + // instance-updater background task will eventually see that the + // instance is in a state which requires an update saga, and ensure that + // one is eventually executed. + // + // Therefore, just spawn the update saga in a new task, and return. + if let Some(saga) = saga { info!(opctx.log, "starting update saga for {instance_id}"; "instance_id" => %instance_id, - "propolis_id" => %propolis_id, "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?migrations, + "migration_state" => ?new_runtime_state.migrations(), ); - let (.., authz_instance) = - LookupPath::new(&opctx, &self.db_datastore) - .instance_id(instance_id.into_untyped_uuid()) - .lookup_for(authz::Action::Modify) - .await?; - let saga_params = sagas::instance_update::Params { - serialized_authn: authn::saga::Serialized::for_opctx(opctx), - authz_instance, - }; let sagas = self.sagas.clone(); + let log = opctx.log.clone(); tokio::spawn(async move { - sagas - .saga_execute::( - saga_params, - ) - .await + if let Err(error) = sagas.saga_start(saga).await { + warn!(&log, "update saga for {instance_id} failed!"; + "instance_id" => %instance_id, + "error" => %error, + ); + } }); } + Ok(()) } @@ -1868,14 +1859,13 @@ impl super::Nexus { } /// Invoked by a sled agent to publish an updated runtime state for an -/// Instance. +/// Instance, returning an update saga for that instance. 
pub(crate) async fn notify_instance_updated( datastore: &DataStore, - sagas: &dyn StartSaga, opctx: &OpContext, instance_id: InstanceUuid, - new_runtime_state: nexus::SledInstanceState, -) -> Result { + new_runtime_state: &nexus::SledInstanceState, +) -> Result, Error> { use sagas::instance_update; let migrations = new_runtime_state.migrations(); @@ -1895,8 +1885,11 @@ pub(crate) async fn notify_instance_updated( migrations, ) .await?; - let updated = vmm_updated || migration_updated.unwrap_or(false); + // If the instance or VMM records in the database have changed as a result + // of this update, prepare an `instance-update` saga to ensure that the + // changes are reflected by the instance record. + let updated = vmm_updated || migration_updated.unwrap_or(false); if updated { let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) @@ -1908,10 +1901,10 @@ pub(crate) async fn notify_instance_updated( authz_instance, }, )?; - sagas.saga_start(saga).await?; + Ok(Some(saga)) + } else { + Ok(None) } - - Ok(updated) } /// Determines the disposition of a request to start an instance given its state diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 1ca2379cb81..5d1ccbf109e 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -609,6 +609,7 @@ async fn sim_instance_migrate( #[cfg(test)] mod tests { + use crate::app::db::datastore::InstanceAndActiveVmm; use crate::app::sagas::test_helpers; use camino::Utf8Path; use dropshot::test_util::ClientTestContext; @@ -622,6 +623,8 @@ mod tests { ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, }; use omicron_sled_agent::sim::Server; + use omicron_test_utils::dev::poll; + use std::time::Duration; use super::*; @@ -859,9 +862,9 @@ mod tests { cptestctx, instance_id, ) - .await.instance().clone(); - if new_state.runtime().nexus_state == nexus_db_model::InstanceState::Vmm { - Err(poll::CondCheckError::::NotYet) + .await; + if new_state.instance().runtime().nexus_state == nexus_db_model::InstanceState::Vmm { + Err(poll::CondCheckError::::NotYet) } else { Ok(new_state) } diff --git a/nexus/src/internal_api/http_entrypoints.rs b/nexus/src/internal_api/http_entrypoints.rs index 28ff712c241..33b626a7fcd 100644 --- a/nexus/src/internal_api/http_entrypoints.rs +++ b/nexus/src/internal_api/http_entrypoints.rs @@ -177,7 +177,7 @@ impl NexusInternalApi for NexusInternalApiImpl { nexus .notify_instance_updated( &opctx, - &InstanceUuid::from_untyped_uuid(path.instance_id), + InstanceUuid::from_untyped_uuid(path.instance_id), &new_state, ) .await?; From f0f20c2bb0c40bc2df57e11bfd6d608b6fefb569 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 4 Jul 2024 09:57:47 -0700 Subject: [PATCH 096/234] also make snapshot_create tests wait for NoVmm --- nexus/src/app/sagas/snapshot_create.rs | 27 ++++++++++++++++++++++ nexus/src/app/sagas/test_helpers.rs | 32 ++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 76a82e74912..797ca842665 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1749,9 +1749,11 @@ mod test { use omicron_common::api::external::InstanceCpuCount; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; + use omicron_test_utils::dev::poll; use sled_agent_client::types::CrucibleOpts; use sled_agent_client::TestInterfaces 
as SledAgentTestInterfaces; use std::str::FromStr; + use std::time::Duration; type DiskTest<'a> = nexus_test_utils::resource_helpers::DiskTest<'a, crate::Server>; @@ -2308,6 +2310,31 @@ mod test { PROJECT_NAME, ) .await; + // Wait until the instance has advanced to the `NoVmm` + // state before deleting it. This may not happen + // immediately, as the `Nexus::cpapi_instances_put` API + // endpoint simply writes the new VMM state to the + // database and *starts* an `instance-update` saga, and + // the instance record isn't updated until that saga + // completes. + poll::wait_for_condition( + || async { + let new_state = test_helpers::instance_fetch_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + if new_state.instance().runtime().nexus_state != nexus_db_model::InstanceState::NoVmm { + Err(poll::CondCheckError::<()>::NotYet) + } else { + Ok(()) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(300), + ) + .await.expect("instance did not advance to NoVmm after 400 seconds"); test_helpers::instance_delete_by_name( cptestctx, INSTANCE_NAME, diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index a5d9d0a8437..0ae85ec6ad2 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -188,6 +188,38 @@ pub async fn instance_fetch( db_state } +pub async fn instance_fetch_by_name( + cptestctx: &ControlPlaneTestContext, + name: &str, + project_name: &str, +) -> InstanceAndActiveVmm { + let datastore = cptestctx.server.server_context().nexus.datastore().clone(); + + let nexus = &cptestctx.server.server_context().nexus; + let opctx = test_opctx(&cptestctx); + let instance_selector = + nexus_types::external_api::params::InstanceSelector { + project: Some(project_name.to_string().try_into().unwrap()), + instance: name.to_string().try_into().unwrap(), + }; + + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector).unwrap(); + let (_, _, authz_instance, ..) 
= instance_lookup.fetch().await.unwrap(); + + let db_state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .expect("test instance's info should be fetchable"); + + info!(&cptestctx.logctx.log, "fetched instance info from db"; + "instance_name" => %name, + "project_name" => %project_name, + "instance_id" => %authz_instance.id(), + "instance_and_vmm" => ?db_state); + + db_state +} pub async fn no_virtual_provisioning_resource_records_exist( cptestctx: &ControlPlaneTestContext, ) -> bool { From ecd0030bc785c0029bfe451011f6c121299d367d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 4 Jul 2024 10:36:39 -0700 Subject: [PATCH 097/234] clippy cleanliness --- nexus/src/app/background/tasks/instance_updater.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 4aa31c07023..183a12fe24d 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -126,7 +126,7 @@ impl InstanceUpdater { Ok(Err(err)) => { warn!(opctx.log, "update saga failed!"; "error" => %err); stats.sagas_failed += 1; - last_err = Err(err.into()); + last_err = Err(err); } Ok(Ok(())) => stats.sagas_completed += 1, } From 473299dee24b3078bdce930da49af599c28686c4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 10:48:54 -0700 Subject: [PATCH 098/234] fixup instance real state determination --- nexus/db-queries/src/db/datastore/instance.rs | 59 ++++++++++++++----- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index c88ba460b8f..86892dbdde2 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -38,6 +38,7 @@ use nexus_db_model::Disk; use nexus_db_model::VmmRuntimeState; use nexus_types::deployment::SledFilter; use omicron_common::api; +use omicron_common::api::external; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -76,9 +77,7 @@ impl InstanceAndActiveVmm { self.vmm.as_ref().map(|v| SledUuid::from_untyped_uuid(v.sled_id)) } - pub fn effective_state( - &self, - ) -> omicron_common::api::external::InstanceState { + pub fn effective_state(&self) -> external::InstanceState { if let Some(vmm) = &self.vmm { vmm.runtime.state.into() } else { @@ -93,17 +92,49 @@ impl From<(Instance, Option)> for InstanceAndActiveVmm { } } -impl From for omicron_common::api::external::Instance { +impl From for external::Instance { fn from(value: InstanceAndActiveVmm) -> Self { - let run_state: omicron_common::api::external::InstanceState; - let time_run_state_updated: chrono::DateTime; - (run_state, time_run_state_updated) = if let Some(vmm) = value.vmm { - (vmm.runtime.state.into(), vmm.runtime.time_state_updated) - } else { - ( - value.instance.runtime_state.nexus_state.into(), - value.instance.runtime_state.time_updated, - ) + use crate::db::model::InstanceState; + use crate::db::model::VmmState; + let time_run_state_updated = value + .vmm + .as_ref() + .map(|vmm| vmm.runtime.time_state_updated) + .unwrap_or(value.instance.runtime_state.time_updated); + + let instance_state = value.instance.runtime_state.nexus_state; + let vmm_state = value.vmm.as_ref().map(|vmm| vmm.runtime.state); + + // We want to only report that an instance is 
`Stopped` when a new + // `instance-start` saga is able to proceed. That means that: + let run_state = match (instance_state, vmm_state) { + // - An instance with a "stopped" VMM needs to be recast as a + // "stopping" instance. + (InstanceState::Vmm, Some(VmmState::Stopped)) => { + external::InstanceState::Stopping + } + // - An instance with a "destroyed" VMM can be recast as a "stopped" + // instance if the start saga is allowed to immediately replace + // it. This applies to "destroyed" VMMs but, critically, *not* to + // "SagaUnwound" VMMs, even though they will be otherwise + // converted to "destroyed" in the public API. + (InstanceState::Vmm, Some(VmmState::Destroyed)) => { + external::InstanceState::Stopped + } + // - An instance with no VMM is always "stopped" (as long as it's + // not "starting" etc.) + (InstanceState::NoVmm, _vmm_state) => { + debug_assert_eq!(_vmm_state, None); + external::InstanceState::Stopped + } + // If there's a VMM state, and none of the above rules apply, use + // that. + (_instance_state, Some(vmm_state)) => { + debug_assert_eq!(_instance_state, InstanceState::Vmm); + vmm_state.into() + } + // If there's no VMM state, use the instance's state. + (instance_state, None) => instance_state.into(), }; Self { @@ -116,7 +147,7 @@ impl From for omicron_common::api::external::Instance { .hostname .parse() .expect("found invalid hostname in the database"), - runtime: omicron_common::api::external::InstanceRuntimeState { + runtime: external::InstanceRuntimeState { run_state, time_run_state_updated, }, From 5a2404e4d363117bc86c391bd81b36bb256ee007 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 10:49:09 -0700 Subject: [PATCH 099/234] make tests wait for states --- nexus/tests/integration_tests/disks.rs | 6 ++ nexus/tests/integration_tests/instances.rs | 69 ++++++++++++++++++---- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index ded4a346fb6..5221fb2de87 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -4,6 +4,7 @@ //! Tests basic disk support in the API +use super::instances::instance_wait_for_state; use super::metrics::{get_latest_silo_metric, query_for_metrics}; use chrono::Utc; use dropshot::test_util::ClientTestContext; @@ -37,6 +38,7 @@ use omicron_common::api::external::Disk; use omicron_common::api::external::DiskState; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; use omicron_nexus::app::{MAX_DISK_SIZE_BYTES, MIN_DISK_SIZE_BYTES}; @@ -395,6 +397,8 @@ async fn test_disk_slot_assignment(cptestctx: &ControlPlaneTestContext) { let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(&client, INSTANCE_NAME, InstanceState::Stopped) + .await; let url_instance_disks = get_instance_disks_url(instance.identity.name.as_str()); let listed_disks = disks_list(&client, &url_instance_disks).await; @@ -504,6 +508,8 @@ async fn test_disk_move_between_instances(cptestctx: &ControlPlaneTestContext) { // is an artificial limitation without hotplug support. 
set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(&client, INSTANCE_NAME, InstanceState::Stopped) + .await; // Verify that there are no disks attached to the instance, and specifically // that our disk is not attached to this instance. diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 8cd49a4d029..6e03ac8df97 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1374,8 +1374,8 @@ async fn test_instance_metrics_with_migration( // After this the instance should be running and should continue to appear // to be provisioned. instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Running); + instance_wait_for_state(&client, instance_name, InstanceState::Running) + .await; check_provisioning_state(4, 1).await; @@ -1387,9 +1387,8 @@ async fn test_instance_metrics_with_migration( // logical states of instances ignoring migration). instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = - instance_get(&client, &get_instance_url(&instance_name)).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(&client, instance_name, InstanceState::Stopped) + .await; check_provisioning_state(0, 0).await; } @@ -1489,8 +1488,8 @@ async fn test_instances_delete_fails_when_running_succeeds_when_stopped( // Stop the instance instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(&client, instance_name, InstanceState::Stopped) + .await; // Now deletion should succeed. NexusRequest::object_delete(&client, &instance_url) @@ -2358,6 +2357,9 @@ async fn test_instance_update_network_interfaces( // Stop the instance again, and now verify that the update works. 
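    // (Sketch for reference; not a line of this patch.) The idiom being
    // applied at each of these call sites, using the `instance_wait_for_state`
    // helper added near the bottom of this file's changes:
    //
    //     instance_post(client, instance_name, InstanceOp::Stop).await;
    //     instance_simulate(nexus, &instance_id).await;
    //     instance_wait_for_state(client, instance_name, InstanceState::Stopped)
    //         .await;
    //
    // A stop is now observed only after the instance-update saga has
    // reconciled the instance record, so tests wait for `Stopped` instead of
    // asserting it immediately after simulating the sled-agent.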
instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_name, InstanceState::Stopped) + .await; + let updated_primary_iface = NexusRequest::object_put( client, &format!("/v1/network-interfaces/{}", primary_iface.identity.id), @@ -3271,8 +3273,8 @@ async fn test_disks_detached_when_instance_destroyed( instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(&client, instance_name, InstanceState::Stopped) + .await; NexusRequest::object_delete(&client, &instance_url) .authn_as(AuthnMode::PrivilegedUser) @@ -4019,8 +4021,9 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) { let instance = instance_next; instance_simulate(nexus, &instance_id).await; - let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + let instance_next = + instance_wait_for_state(&client, instance_name, InstanceState::Stopped) + .await; assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated @@ -4192,6 +4195,8 @@ async fn stop_and_delete_instance( &InstanceUuid::from_untyped_uuid(instance.identity.id), ) .await; + instance_wait_for_state(client, instance_name, InstanceState::Stopped) + .await; let url = format!("/v1/instances/{}?project={}", instance_name, PROJECT_NAME); object_delete(client, &url).await; @@ -4617,6 +4622,8 @@ async fn test_instance_create_in_silo(cptestctx: &ControlPlaneTestContext) { .expect("Failed to stop the instance"); instance_simulate_with_opctx(nexus, &instance_id, &opctx).await; + instance_wait_for_state(client, instance_name, InstanceState::Stopped) + .await; // Delete the instance NexusRequest::object_delete(client, &instance_url) @@ -4770,6 +4777,46 @@ pub enum InstanceOp { Reboot, } +pub async fn instance_wait_for_state( + client: &ClientTestContext, + instance_name: &str, + state: omicron_common::api::external::InstanceState, +) -> Instance { + const MAX_WAIT: Duration = Duration::from_secs(120); + let url = get_instance_url(instance_name); + + slog::info!( + &client.client_log, + "waiting for '{instance_name}' to transition to {state}..."; + ); + let result = wait_for_condition( + || async { + let instance = instance_get(client, &url).await; + if instance.runtime.run_state == state { + Ok(instance) + } else { + slog::info!( + &client.client_log, + "instance '{instance_name}' has not transitioned to {state}"; + "instance_id" => %instance.identity.id, + "instance_runtime_state" => ?instance.runtime, + ); + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_secs(1), + &MAX_WAIT, + ) + .await; + match result { + Ok(instance) => instance, + Err(_) => panic!( + "instance '{instance_name}' did not transition to {state:?} \ + after {MAX_WAIT:?}" + ), + } +} + pub async fn instance_post( client: &ClientTestContext, instance_name: &str, From 6af03b580765182b6c4278a733414fd71750e5ab Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 10:49:44 -0700 Subject: [PATCH 100/234] make logging in start saga more useful --- nexus/src/app/sagas/instance_update/start.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 
6e501b5c550..b9377822a12 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -169,11 +169,15 @@ async fn siu_fetch_state_and_start_real_saga( osagactx.log(), "instance update: starting real update saga..."; "instance_id" => %authz_instance.id(), - "new_runtime_state" => ?update.new_runtime, - "network_config_update" => ?update.network_config, - "destroy_active_vmm" => ?update.destroy_active_vmm, - "destroy_target_vmm" => ?update.destroy_target_vmm, - "deprovision" => update.deprovision, + "current.runtime_state" => ?state.instance.runtime(), + "current.migration" => ?state.migration, + "current.active_vmm" => ?state.active_vmm, + "current.target_vmm" => ?state.target_vmm, + "update.new_runtime_state" => ?update.new_runtime, + "update.network_config_update" => ?update.network_config, + "update.destroy_active_vmm" => ?update.destroy_active_vmm, + "update.destroy_target_vmm" => ?update.destroy_target_vmm, + "update.deprovision" => update.deprovision, ); osagactx .nexus() @@ -192,6 +196,10 @@ async fn siu_fetch_state_and_start_real_saga( osagactx.log(), "instance update: no updates required, releasing lock."; "instance_id" => %authz_instance.id(), + "current.runtime_state" => ?state.instance.runtime(), + "current.migration" => ?state.migration, + "current.active_vmm" => ?state.active_vmm, + "current.target_vmm" => ?state.target_vmm, ); super::unlock_instance_inner( &serialized_authn, From df9e1eb38eba1b91f76dfd312535ab81fb8475b5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 11:31:32 -0700 Subject: [PATCH 101/234] don't treat instances as stopped until virtual resources are gone --- nexus/db-queries/src/db/datastore/instance.rs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 86892dbdde2..9d8167817cd 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -108,19 +108,13 @@ impl From for external::Instance { // We want to only report that an instance is `Stopped` when a new // `instance-start` saga is able to proceed. That means that: let run_state = match (instance_state, vmm_state) { - // - An instance with a "stopped" VMM needs to be recast as a - // "stopping" instance. - (InstanceState::Vmm, Some(VmmState::Stopped)) => { + // - An instance with a "stopped" or "destroyed" VMM needs to be + // recast as a "stopping" instance, as the virtual provisioning + // resources for that instance have not been deallocated until the + // active VMM ID has been unlinked by an update saga. + (InstanceState::Vmm, Some(VmmState::Stopped | VmmState::Destroyed)) => { external::InstanceState::Stopping } - // - An instance with a "destroyed" VMM can be recast as a "stopped" - // instance if the start saga is allowed to immediately replace - // it. This applies to "destroyed" VMMs but, critically, *not* to - // "SagaUnwound" VMMs, even though they will be otherwise - // converted to "destroyed" in the public API. - (InstanceState::Vmm, Some(VmmState::Destroyed)) => { - external::InstanceState::Stopped - } // - An instance with no VMM is always "stopped" (as long as it's // not "starting" etc.) 
(InstanceState::NoVmm, _vmm_state) => { From 9a6b1bf7b539c9e5478a9d08fd347d3b493827c5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 11:33:16 -0700 Subject: [PATCH 102/234] temp fix sagas not being rescheduled --- nexus/src/app/instance.rs | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index f2a159d2037..8f203d8ccbb 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1409,13 +1409,43 @@ impl super::Nexus { "migration_state" => ?new_runtime_state.migrations(), ); let sagas = self.sagas.clone(); + let task_instance_updater = + self.background_tasks.task_instance_updater.clone(); let log = opctx.log.clone(); tokio::spawn(async move { - if let Err(error) = sagas.saga_start(saga).await { - warn!(&log, "update saga for {instance_id} failed!"; + // TODO(eliza): maybe we should use the lower level saga API so + // we can see if the saga failed due to the lock being held and + // retry it immediately? + let running_saga = async move { + let runnable_saga = sagas.saga_prepare(saga).await?; + runnable_saga.start().await + } + .await; + let result = match running_saga { + Err(error) => { + error!(&log, "failed to start update saga for {instance_id}"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't start the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + return; + } + Ok(saga) => { + saga.wait_until_stopped().await.into_omicron_result() + } + }; + if let Err(error) = result { + error!(&log, "update saga for {instance_id} failed"; "instance_id" => %instance_id, "error" => %error, ); + // If we couldn't complete the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. 
+ task_instance_updater.activate(); } }); } From 723ae5650f84b0773325fe7e9a7bddf343508d4e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 11:33:47 -0700 Subject: [PATCH 103/234] instance_wait_for_state should log successful transitions --- nexus/tests/integration_tests/instances.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 6e03ac8df97..96247362a0a 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -4809,7 +4809,10 @@ pub async fn instance_wait_for_state( ) .await; match result { - Ok(instance) => instance, + Ok(instance) => { + slog::info!(&client.client_log, "instance '{instance_name}' has transitioned to {state}"); + instance + } Err(_) => panic!( "instance '{instance_name}' did not transition to {state:?} \ after {MAX_WAIT:?}" From ef24a5d0c52353d1a290deb294250556ad0ace5e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 11:40:49 -0700 Subject: [PATCH 104/234] remove defunct test output files --- ...ance_and_vmm_update_vmm_and_imigration.sql | 55 ------------ ..._vmm_update_vmm_instance_and_migration.sql | 84 ------------------- 2 files changed, 139 deletions(-) delete mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql delete mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql deleted file mode 100644 index 9c54c8b8efb..00000000000 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_imigration.sql +++ /dev/null @@ -1,55 +0,0 @@ -WITH - migration_found - AS ( - SELECT - ( - SELECT - migration.id - FROM - migration - WHERE - migration.id = $1 AND (migration.time_deleted IS NULL) - ) - AS id - ), - migration_updated - AS ( - UPDATE - migration - SET - source_state = $2, time_source_updated = $3 - WHERE - (migration.id = $4 AND migration.source_propolis_id = $5) AND migration.source_gen < $6 - RETURNING - id - ), - migration_result - AS ( - SELECT - migration_found.id AS found, migration_updated.id AS updated - FROM - migration_found LEFT JOIN migration_updated ON migration_found.id = migration_updated.id - ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $7) AS id), - vmm_updated - AS ( - UPDATE - vmm - SET - time_state_updated = $8, state_generation = $9, state = $10 - WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $11) AND vmm.state_generation < $12 - RETURNING - id - ), - vmm_result - AS ( - SELECT - vmm_found.id AS found, vmm_updated.id AS updated - FROM - vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id - ) -SELECT - vmm_result.found, vmm_result.updated, NULL, NULL, migration_result.found, migration_result.updated -FROM - vmm_result, migration_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql deleted file mode 100644 index bee0b68a3a4..00000000000 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration.sql +++ /dev/null @@ -1,84 +0,0 @@ -WITH - instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), - instance_updated - AS ( - UPDATE - instance - SET - time_state_updated = 
$2, - state_generation = $3, - active_propolis_id = $4, - target_propolis_id = $5, - migration_id = $6, - state = $7 - WHERE - ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 - RETURNING - id - ), - instance_result - AS ( - SELECT - instance_found.id AS found, instance_updated.id AS updated - FROM - instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id - ), - migration_found - AS ( - SELECT - ( - SELECT - migration.id - FROM - migration - WHERE - migration.id = $10 AND (migration.time_deleted IS NULL) - ) - AS id - ), - migration_updated - AS ( - UPDATE - migration - SET - source_state = $11, time_source_updated = $12 - WHERE - (migration.id = $13 AND migration.source_propolis_id = $14) AND migration.source_gen < $15 - RETURNING - id - ), - migration_result - AS ( - SELECT - migration_found.id AS found, migration_updated.id AS updated - FROM - migration_found LEFT JOIN migration_updated ON migration_found.id = migration_updated.id - ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $16) AS id), - vmm_updated - AS ( - UPDATE - vmm - SET - time_state_updated = $17, state_generation = $18, state = $19 - WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $20) AND vmm.state_generation < $21 - RETURNING - id - ), - vmm_result - AS ( - SELECT - vmm_found.id AS found, vmm_updated.id AS updated - FROM - vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id - ) -SELECT - vmm_result.found, - vmm_result.updated, - instance_result.found, - instance_result.updated, - migration_result.found, - migration_result.updated -FROM - vmm_result, instance_result, migration_result From 44559461c3c339add1e2d0167d185e11834a70bb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 14:26:00 -0700 Subject: [PATCH 105/234] fix `instance_wait_for_state` accidentally using the instance tests project --- nexus/tests/integration_tests/disks.rs | 6 +-- nexus/tests/integration_tests/instances.rs | 57 ++++++++++------------ 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 5221fb2de87..cc8657a2bd9 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -397,8 +397,7 @@ async fn test_disk_slot_assignment(cptestctx: &ControlPlaneTestContext) { let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, INSTANCE_NAME, InstanceState::Stopped) - .await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; let url_instance_disks = get_instance_disks_url(instance.identity.name.as_str()); let listed_disks = disks_list(&client, &url_instance_disks).await; @@ -508,8 +507,7 @@ async fn test_disk_move_between_instances(cptestctx: &ControlPlaneTestContext) { // is an artificial limitation without hotplug support. set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, INSTANCE_NAME, InstanceState::Stopped) - .await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; // Verify that there are no disks attached to the instance, and specifically // that our disk is not attached to this instance. 
diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 96247362a0a..2c5b3ad6632 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1374,8 +1374,7 @@ async fn test_instance_metrics_with_migration( // After this the instance should be running and should continue to appear // to be provisioned. instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; - instance_wait_for_state(&client, instance_name, InstanceState::Running) - .await; + instance_wait_for_state(&client, instance_id, InstanceState::Running).await; check_provisioning_state(4, 1).await; @@ -1387,8 +1386,7 @@ async fn test_instance_metrics_with_migration( // logical states of instances ignoring migration). instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_name, InstanceState::Stopped) - .await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; check_provisioning_state(0, 0).await; } @@ -1488,8 +1486,7 @@ async fn test_instances_delete_fails_when_running_succeeds_when_stopped( // Stop the instance instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_name, InstanceState::Stopped) - .await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; // Now deletion should succeed. NexusRequest::object_delete(&client, &instance_url) @@ -2357,8 +2354,7 @@ async fn test_instance_update_network_interfaces( // Stop the instance again, and now verify that the update works. instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_name, InstanceState::Stopped) - .await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let updated_primary_iface = NexusRequest::object_put( client, @@ -3273,8 +3269,7 @@ async fn test_disks_detached_when_instance_destroyed( instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_name, InstanceState::Stopped) - .await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; NexusRequest::object_delete(&client, &instance_url) .authn_as(AuthnMode::PrivilegedUser) @@ -4022,7 +4017,7 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) { let instance = instance_next; instance_simulate(nexus, &instance_id).await; let instance_next = - instance_wait_for_state(&client, instance_name, InstanceState::Stopped) + instance_wait_for_state(&client, instance_id, InstanceState::Stopped) .await; assert!( instance_next.runtime.time_run_state_updated @@ -4189,14 +4184,10 @@ async fn stop_and_delete_instance( let client = &cptestctx.external_client; let instance = instance_post(&client, instance_name, InstanceOp::Stop).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); let nexus = &cptestctx.server.server_context().nexus; - instance_simulate( - nexus, - &InstanceUuid::from_untyped_uuid(instance.identity.id), - ) - .await; - instance_wait_for_state(client, instance_name, InstanceState::Stopped) - .await; + instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let url = format!("/v1/instances/{}?project={}", 
instance_name, PROJECT_NAME); object_delete(client, &url).await; @@ -4622,8 +4613,7 @@ async fn test_instance_create_in_silo(cptestctx: &ControlPlaneTestContext) { .expect("Failed to stop the instance"); instance_simulate_with_opctx(nexus, &instance_id, &opctx).await; - instance_wait_for_state(client, instance_name, InstanceState::Stopped) - .await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Delete the instance NexusRequest::object_delete(client, &instance_url) @@ -4779,29 +4769,33 @@ pub enum InstanceOp { pub async fn instance_wait_for_state( client: &ClientTestContext, - instance_name: &str, + instance_id: InstanceUuid, state: omicron_common::api::external::InstanceState, ) -> Instance { const MAX_WAIT: Duration = Duration::from_secs(120); - let url = get_instance_url(instance_name); slog::info!( &client.client_log, - "waiting for '{instance_name}' to transition to {state}..."; + "waiting for instance {instance_id} to transition to {state}..."; ); + let url = format!("/v1/instances/{instance_id}"); let result = wait_for_condition( || async { - let instance = instance_get(client, &url).await; + let instance: Instance = NexusRequest::object_get(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await? + .parsed_body()?; if instance.runtime.run_state == state { Ok(instance) } else { slog::info!( &client.client_log, - "instance '{instance_name}' has not transitioned to {state}"; + "instance {instance_id} has not transitioned to {state}"; "instance_id" => %instance.identity.id, "instance_runtime_state" => ?instance.runtime, ); - Err(CondCheckError::<()>::NotYet) + Err(CondCheckError::::NotYet) } }, &Duration::from_secs(1), @@ -4810,12 +4804,15 @@ pub async fn instance_wait_for_state( .await; match result { Ok(instance) => { - slog::info!(&client.client_log, "instance '{instance_name}' has transitioned to {state}"); + slog::info!( + &client.client_log, + "instance {instance_id} has transitioned to {state}" + ); instance } - Err(_) => panic!( - "instance '{instance_name}' did not transition to {state:?} \ - after {MAX_WAIT:?}" + Err(e) => panic!( + "instance {instance_id} did not transition to {state:?} \ + after {MAX_WAIT:?}: {e}" ), } } From 1e0ed7b8a048744b46bad6b7d76d622020645e19 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 14:31:28 -0700 Subject: [PATCH 106/234] add missing wait for stop in disk tests --- nexus/tests/integration_tests/disks.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index cc8657a2bd9..7d8583293bb 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -238,18 +238,15 @@ async fn test_disk_create_attach_detach_delete( // Create an instance to attach the disk. let instance = create_instance(&client, PROJECT_NAME, INSTANCE_NAME).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); // TODO(https://github.com/oxidecomputer/omicron/issues/811): // // Instances must be stopped before disks can be attached - this // is an artificial limitation without hotplug support. 
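    // (With asynchronous instance updates, the stop only takes effect once the
    // update saga has unlinked the stopped VMM, so wait for the instance to
    // actually report `Stopped` before attaching the disk.)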
- let instance_next = - set_instance_state(&client, INSTANCE_NAME, "stop").await; - instance_simulate( - nexus, - &InstanceUuid::from_untyped_uuid(instance_next.identity.id), - ) - .await; + set_instance_state(&client, INSTANCE_NAME, "stop").await; + instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Verify that there are no disks attached to the instance, and specifically // that our disk is not attached to this instance. @@ -398,6 +395,7 @@ async fn test_disk_slot_assignment(cptestctx: &ControlPlaneTestContext) { set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; + let url_instance_disks = get_instance_disks_url(instance.identity.name.as_str()); let listed_disks = disks_list(&client, &url_instance_disks).await; From 093ee441444fe6013319988143f76e74a7be4afc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 14:46:55 -0700 Subject: [PATCH 107/234] found some more places that need to wait for stop --- nexus/tests/integration_tests/external_ips.rs | 3 +++ nexus/tests/integration_tests/instances.rs | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/nexus/tests/integration_tests/external_ips.rs b/nexus/tests/integration_tests/external_ips.rs index 27893188558..0940c8675b2 100644 --- a/nexus/tests/integration_tests/external_ips.rs +++ b/nexus/tests/integration_tests/external_ips.rs @@ -9,6 +9,7 @@ use std::net::Ipv4Addr; use crate::integration_tests::instances::fetch_instance_external_ips; use crate::integration_tests::instances::instance_simulate; +use crate::integration_tests::instances::instance_wait_for_state; use dropshot::test_util::ClientTestContext; use dropshot::HttpErrorResponseBody; use http::Method; @@ -47,6 +48,7 @@ use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::IdentityMetadataUpdateParams; use omicron_common::api::external::Instance; use omicron_common::api::external::InstanceCpuCount; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; use omicron_uuid_kinds::GenericUuid; @@ -696,6 +698,7 @@ async fn test_floating_ip_create_attachment( .unwrap(); instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; NexusRequest::object_delete( &client, diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 2c5b3ad6632..6a13b2f6f35 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -2087,6 +2087,7 @@ async fn test_instance_create_delete_network_interface( let instance = instance_post(client, instance_name, InstanceOp::Stop).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Verify we can now make the requests again let mut interfaces = Vec::with_capacity(2); @@ -2156,6 +2157,7 @@ async fn test_instance_create_delete_network_interface( // Stop the instance and verify we can delete the interface instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // We should not be able to 
delete the primary interface, while the // secondary still exists @@ -2294,6 +2296,7 @@ async fn test_instance_update_network_interfaces( let instance = instance_post(client, instance_name, InstanceOp::Stop).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Create the first interface on the instance. let primary_iface = NexusRequest::objects_post( @@ -2489,6 +2492,7 @@ async fn test_instance_update_network_interfaces( // Stop the instance again. instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Verify that we can set the secondary as the new primary, and that nothing // else changes about the NICs. @@ -3787,6 +3791,8 @@ async fn test_cannot_provision_instance_beyond_cpu_capacity( instance_simulate(nexus, &instance_id).await; instances[1] = instance_post(client, configs[1].0, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + expect_instance_start_ok(client, configs[2].0).await; } @@ -3894,6 +3900,8 @@ async fn test_cannot_provision_instance_beyond_ram_capacity( instance_simulate(nexus, &instance_id).await; instances[1] = instance_post(client, configs[1].0, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + expect_instance_start_ok(client, configs[2].0).await; } From edc69a5e73dcb4c912b2bbacb9aed21087d11a8d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 15:11:54 -0700 Subject: [PATCH 108/234] WOW THERE'S MORE OF THEM (+authn stuff) --- nexus/tests/integration_tests/instances.rs | 35 +++++++++++++++++----- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 6a13b2f6f35..464349a5e21 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -629,8 +629,7 @@ async fn test_instance_start_creates_networking_state( instance_simulate(nexus, &instance_id).await; instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; - let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Forcibly clear the instance's V2P mappings to simulate what happens when // the control plane comes up when an instance is stopped. 
@@ -1226,9 +1225,7 @@ async fn test_instance_metrics(cptestctx: &ControlPlaneTestContext) { instance_post(&client, instance_name, InstanceOp::Stop).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); instance_simulate(nexus, &instance_id).await; - let instance = - instance_get(&client, &get_instance_url(&instance_name)).await; - assert_eq!(instance.runtime.run_state, InstanceState::Stopped); + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let virtual_provisioning_collection = datastore .virtual_provisioning_collection_get(&opctx, project_id) @@ -4621,7 +4618,13 @@ async fn test_instance_create_in_silo(cptestctx: &ControlPlaneTestContext) { .expect("Failed to stop the instance"); instance_simulate_with_opctx(nexus, &instance_id, &opctx).await; - instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; + instance_wait_for_state_as( + client, + AuthnMode::SiloUser(user_id), + instance_id, + InstanceState::Stopped, + ) + .await; // Delete the instance NexusRequest::object_delete(client, &instance_url) @@ -4709,6 +4712,7 @@ async fn test_instance_v2p_mappings(cptestctx: &ControlPlaneTestContext) { instance_simulate(nexus, &instance_id).await; instance_post(&client, instance_name, InstanceOp::Stop).await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; let instance_url = get_instance_url(instance_name); NexusRequest::object_delete(client, &instance_url) @@ -4779,6 +4783,23 @@ pub async fn instance_wait_for_state( client: &ClientTestContext, instance_id: InstanceUuid, state: omicron_common::api::external::InstanceState, +) -> Instance { + instance_wait_for_state_as( + client, + AuthnMode::PrivilegedUser, + instance_id, + state, + ) + .await +} + +/// Line [`instance_wait_for_state`], but with an [`AuthnMode`] parameter for +/// the instance lookup requests. +pub async fn instance_wait_for_state_as( + client: &ClientTestContext, + authn_as: AuthnMode, + instance_id: InstanceUuid, + state: omicron_common::api::external::InstanceState, ) -> Instance { const MAX_WAIT: Duration = Duration::from_secs(120); @@ -4790,7 +4811,7 @@ pub async fn instance_wait_for_state( let result = wait_for_condition( || async { let instance: Instance = NexusRequest::object_get(client, &url) - .authn_as(AuthnMode::PrivilegedUser) + .authn_as(authn_as.clone()) .execute() .await? .parsed_body()?; From 77c458aab28d016ad2e9093e3b09850af7015df4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 15:15:54 -0700 Subject: [PATCH 109/234] also include SagaUnwound in stopping --- nexus/db-queries/src/db/datastore/instance.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 9d8167817cd..3718338f51b 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -112,9 +112,14 @@ impl From for external::Instance { // recast as a "stopping" instance, as the virtual provisioning // resources for that instance have not been deallocated until the // active VMM ID has been unlinked by an update saga. 
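            // (`SagaUnwound` VMMs are grouped in here as well, since the
            // instance still has an active VMM ID linked in that case.)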
- (InstanceState::Vmm, Some(VmmState::Stopped | VmmState::Destroyed)) => { - external::InstanceState::Stopping - } + ( + InstanceState::Vmm, + Some( + VmmState::Stopped + | VmmState::Destroyed + | VmmState::SagaUnwound, + ), + ) => external::InstanceState::Stopping, // - An instance with no VMM is always "stopped" (as long as it's // not "starting" etc.) (InstanceState::NoVmm, _vmm_state) => { From 8dadd3030ec2918c1fda6369da71d19ad7b5d3c7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 8 Jul 2024 15:19:09 -0700 Subject: [PATCH 110/234] oh my god theres even more of them --- nexus/tests/integration_tests/instances.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 464349a5e21..90bd200a8ba 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -421,8 +421,9 @@ async fn test_instances_create_reboot_halt( let instance = instance_next; instance_simulate(nexus, &instance_id).await; - let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + let instance_next = + instance_wait_for_state(client, instance_id, InstanceState::Stopped) + .await; assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated @@ -516,8 +517,9 @@ async fn test_instances_create_reboot_halt( // assert_eq!(error.message, "cannot reboot instance in state \"stopping\""); let instance = instance_next; instance_simulate(nexus, &instance_id).await; - let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + let instance_next = + instance_wait_for_state(client, instance_id, InstanceState::Stopped) + .await; assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated From a6af28674c8ed80b196763df135e11cb12f0926f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 11:43:51 -0700 Subject: [PATCH 111/234] WHEW OKAY ACTUALLY DO THE MIGRATION --- clients/sled-agent-client/src/lib.rs | 12 +++++ nexus/tests/integration_tests/instances.rs | 57 ++++++++++++++++++++++ sled-agent/src/sim/http_entrypoints.rs | 18 ++++++- sled-agent/src/sim/sled_agent.rs | 4 +- 4 files changed, 88 insertions(+), 3 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index 158e8676174..ba3f0a054d5 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -447,6 +447,7 @@ impl From /// are bonus endpoints, not generated in the real client. 
#[async_trait] pub trait TestInterfaces { + async fn instance_single_step(&self, id: Uuid); async fn instance_finish_transition(&self, id: Uuid); async fn instance_simulate_migration_source( &self, @@ -458,6 +459,17 @@ pub trait TestInterfaces { #[async_trait] impl TestInterfaces for Client { + async fn instance_single_step(&self, id: Uuid) { + let baseurl = self.baseurl(); + let client = self.client(); + let url = format!("{}/instances/{}/poke-single-step", baseurl, id); + client + .post(url) + .send() + .await + .expect("instance_single_step() failed unexpectedly"); + } + async fn instance_finish_transition(&self, id: Uuid) { let baseurl = self.baseurl(); let client = self.client(); diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 90bd200a8ba..c46bafa5088 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1367,11 +1367,52 @@ async fn test_instance_metrics_with_migration( .parsed_body::() .unwrap(); + let migration_id = { + let datastore = apictx.nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.new(o!()), + datastore.clone(), + ); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance.identity.id) + .lookup_for(nexus_db_queries::authz::Action::Read) + .await + .unwrap(); + datastore + .instance_refetch(&opctx, &authz_instance) + .await + .unwrap() + .runtime_state + .migration_id + .expect("since we've started a migration, the instance record must have a migration id!") + }; + + // Wait for the instance to be in the `Migrating` state. Otherwise, the + // subsequent `instance_wait_for_state(..., Running)` may see the `Running` + // state from the *old* VMM, rather than waiting for the migration to + // complete. + instance_simulate_migration_source( + cptestctx, + nexus, + original_sled, + instance_id, + migration_id, + ) + .await; + instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + .await; + instance_wait_for_state(&client, instance_id, InstanceState::Migrating) + .await; + check_provisioning_state(4, 1).await; // Complete migration on the target. Simulated migrations always succeed. // After this the instance should be running and should continue to appear // to be provisioned. + instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; instance_wait_for_state(&client, instance_id, InstanceState::Running).await; @@ -5014,6 +5055,22 @@ pub async fn instance_simulate(nexus: &Arc, id: &InstanceUuid) { sa.instance_finish_transition(id.into_untyped_uuid()).await; } +/// Simulate one step of an ongoing instance state transition. To do this, we +/// have to look up the instance, then get the sled agent associated with that +/// instance, and then tell it to finish simulating whatever async transition is +/// going on. 
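+/// Unlike [`instance_simulate_on_sled`], this advances the simulated instance
+/// by only a single step instead of draining all queued transitions.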
+async fn instance_single_step_on_sled( + cptestctx: &ControlPlaneTestContext, + nexus: &Arc, + sled_id: SledUuid, + instance_id: InstanceUuid, +) { + info!(&cptestctx.logctx.log, "Single-stepping simulated instance on sled"; + "instance_id" => %instance_id, "sled_id" => %sled_id); + let sa = nexus.sled_client(&sled_id).await.unwrap(); + sa.instance_single_step(instance_id.into_untyped_uuid()).await; +} + pub async fn instance_simulate_with_opctx( nexus: &Arc, id: &InstanceUuid, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 43aeec72a5c..51e5ad977fc 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -4,6 +4,7 @@ //! HTTP entrypoint functions for the sled agent's exposed API +use super::collection::PokeMode; use crate::bootstrap::params::AddSledRequest; use crate::params::{ DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, @@ -53,6 +54,7 @@ pub fn api() -> SledApiDescription { api.register(instance_put_external_ip)?; api.register(instance_delete_external_ip)?; api.register(instance_poke_post)?; + api.register(instance_poke_single_step_post)?; api.register(instance_post_sim_migration_source)?; api.register(disk_put)?; api.register(disk_poke_post)?; @@ -215,7 +217,21 @@ async fn instance_poke_post( ) -> Result { let sa = rqctx.context(); let instance_id = path_params.into_inner().instance_id; - sa.instance_poke(instance_id).await; + sa.instance_poke(instance_id, PokeMode::Drain).await; + Ok(HttpResponseUpdatedNoContent()) +} + +#[endpoint { + method = POST, + path = "/instances/{instance_id}/poke-single-step", +}] +async fn instance_poke_single_step_post( + rqctx: RequestContext>, + path_params: Path, +) -> Result { + let sa = rqctx.context(); + let instance_id = path_params.into_inner().instance_id; + sa.instance_poke(instance_id, PokeMode::SingleStep).await; Ok(HttpResponseUpdatedNoContent()) } diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index 7b198695f23..79d57a42e65 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -614,8 +614,8 @@ impl SledAgent { self.disks.size().await } - pub async fn instance_poke(&self, id: InstanceUuid) { - self.instances.sim_poke(id.into_untyped_uuid(), PokeMode::Drain).await; + pub async fn instance_poke(&self, id: InstanceUuid, mode: PokeMode) { + self.instances.sim_poke(id.into_untyped_uuid(), mode).await; } pub async fn disk_poke(&self, id: Uuid) { From 06d01c1ebffaf32e021544d3b35fba9fcf83cc2b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 12:39:29 -0700 Subject: [PATCH 112/234] another disk test that needs to wait for stop --- nexus/tests/integration_tests/disks.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 7d8583293bb..234ab5f3821 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -543,6 +543,8 @@ async fn test_disk_move_between_instances(cptestctx: &ControlPlaneTestContext) { let instance2_id = InstanceUuid::from_untyped_uuid(instance2.identity.id); set_instance_state(&client, "instance2", "stop").await; instance_simulate(nexus, &instance2_id).await; + instance_wait_for_state(&client, instance2_id, InstanceState::Stopped) + .await; let url_instance2_attach_disk = get_disk_attach_url(&instance2.identity.id.into()); From b9f5a46574e87ae22c8af69af551eea7aeeb735d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 
9 Jul 2024 13:00:36 -0700 Subject: [PATCH 113/234] oh there's also pantry tests that stop instances --- nexus/tests/integration_tests/pantry.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/tests/integration_tests/pantry.rs b/nexus/tests/integration_tests/pantry.rs index 29e590b1a99..d77ad49db69 100644 --- a/nexus/tests/integration_tests/pantry.rs +++ b/nexus/tests/integration_tests/pantry.rs @@ -4,6 +4,7 @@ //! Tests Nexus' interactions with Crucible's pantry +use crate::integration_tests::instances::instance_wait_for_state; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; @@ -24,6 +25,7 @@ use omicron_common::api::external::Disk; use omicron_common::api::external::DiskState; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::Instance; +use omicron_common::api::external::InstanceState; use omicron_nexus::Nexus; use omicron_nexus::TestInterfaces as _; use omicron_uuid_kinds::GenericUuid; @@ -157,6 +159,7 @@ async fn create_instance_and_attach_disk( // is an artificial limitation without hotplug support. set_instance_state(&client, INSTANCE_NAME, "stop").await; instance_simulate(nexus, &instance_id).await; + instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; let url_instance_attach_disk = get_disk_attach_url(instance.identity.name.as_str()); From a5aa853ddc1a807470b8eb840e598217cfcfd0b5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:02:03 -0700 Subject: [PATCH 114/234] clippy cleanliness --- nexus/src/app/instance.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 8f203d8ccbb..95a2f5122fc 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -12,7 +12,6 @@ use super::MAX_NICS_PER_INSTANCE; use super::MAX_SSH_KEYS_PER_INSTANCE; use super::MAX_VCPU_PER_INSTANCE; use super::MIN_MEMORY_BYTES_PER_INSTANCE; -use crate::app::saga::StartSaga; use crate::app::sagas; use crate::app::sagas::NexusSaga; use crate::cidata::InstanceCiData; From 2cde269f83229748df1865285aefb7ca7c6e05e6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:19:48 -0700 Subject: [PATCH 115/234] bump instance-updater bg task period in tests --- nexus/tests/config.test.toml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 23d3d2f5bc7..f78aee3a88d 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -124,7 +124,20 @@ v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 saga_recovery.period_secs = 600 lookup_region_port.period_secs = 60 -instance_updater.period_secs = 30 +# The purpose of the `instance-updater` background task is to ensure that update +# sagas are always *eventually* started for instances whose database state has +# changed, even if the update saga was not started by the Nexus replica handling +# an update from sled-agent. This is to ensure that updates are performed even +# in cases where a Nexus crashes or otherwise disappears between when the +# updated VMM and migration state is written to CRDB and when the resulting +# update saga actually starts executing. However, we would prefer update sagas +# to be executed in a timely manner, so for integration tests, we don't want to +# *rely* on the instance-updater background task for running these sagas. 
+# +# Therefore, set a period long enough that this task won't activate during a +# reasonable integration test execution. Tests for the instance-updater task +# will explictly activate it. +instance_updater.period_secs = 600 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the From b46676a970f09ff6235bac72a3ecbdf37ae6d9d1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:20:58 -0700 Subject: [PATCH 116/234] use migration_mark_failed where it's supposed to be used --- nexus/src/app/sagas/instance_migrate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 5d1ccbf109e..950a9ed795b 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -266,7 +266,7 @@ async fn sim_fail_migration_record( // If the migration record wasn't updated, this means it's already deleted, // which...seems weird, but isn't worth getting the whole saga unwind stuck over. if let Err(e) = - osagactx.datastore().migration_mark_deleted(&opctx, migration_id).await + osagactx.datastore().migration_mark_failed(&opctx, migration_id).await { warn!(osagactx.log(), "Error marking migration record as failed during rollback"; From 7a75c7b6caea19f3529794d2a94412b1a74e93fb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:33:59 -0700 Subject: [PATCH 117/234] don't re-query sled ID for network cfg update (thanks @gjcolombo) --- nexus/src/app/sagas/instance_update/mod.rs | 35 +++++++++++----------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index bdb754f0eef..48485b76291 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -57,7 +57,7 @@ struct UpdatesRequired { #[derive(Debug, Deserialize, Serialize)] enum NetworkConfigUpdate { Delete, - Update(PropolisUuid), + Update { active_propolis_id: PropolisUuid, new_sled_id: Uuid }, } impl UpdatesRequired { @@ -156,11 +156,16 @@ impl UpdatesRequired { // creating these mappings, this path only needs to be taken if an // instance has changed sleds. 
if failed && destroy_active_vmm.is_none() { - network_config = Some(NetworkConfigUpdate::Update( - PropolisUuid::from_untyped_uuid( + network_config = Some(NetworkConfigUpdate::Update { + active_propolis_id: PropolisUuid::from_untyped_uuid( migration.source_propolis_id, ), - )); + new_sled_id: snapshot + .active_vmm + .as_ref() + .expect("if we're here, there must be an active VMM") + .sled_id, + }); update_required = true; } @@ -176,11 +181,16 @@ impl UpdatesRequired { "target_propolis_id" => %migration.target_propolis_id, ); - network_config = Some(NetworkConfigUpdate::Update( - PropolisUuid::from_untyped_uuid( + network_config = Some(NetworkConfigUpdate::Update { + active_propolis_id: PropolisUuid::from_untyped_uuid( migration.target_propolis_id, ), - )); + new_sled_id: snapshot + .target_vmm + .as_ref() + .expect("if we're here, there must be a target VMM") + .sled_id, + }); new_runtime.propolis_id = Some(migration.target_propolis_id); // Even if the active VMM was destroyed (and we set the // instance's state to `NoVmm` above), it has successfully @@ -441,16 +451,7 @@ async fn siu_update_network_config( .await .map_err(ActionError::action_failed)?; } - NetworkConfigUpdate::Update(active_propolis_id) => { - // Look up the ID of the sled that the instance now resides on, so that we - // can look up its address. - let new_sled_id = osagactx - .datastore() - .vmm_fetch(&opctx, authz_instance, &active_propolis_id) - .await - .map_err(ActionError::action_failed)? - .sled_id; - + NetworkConfigUpdate::Update { active_propolis_id, new_sled_id } => { info!( osagactx.log(), "instance update: ensuring updated instance network config"; From 46ae4cbe22ea2f24f6d4981b208ece50024a8969 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:36:05 -0700 Subject: [PATCH 118/234] slightly more accurate log message --- nexus/src/app/sagas/instance_migrate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 950a9ed795b..ba3af62e939 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -348,7 +348,7 @@ async fn sim_set_migration_ids( let migration_id = sagactx.lookup::("migrate_id")?; let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; - info!(osagactx.log(), "setting migration IDs on migration source sled"; + info!(osagactx.log(), "setting instance migration IDs"; "instance_id" => %db_instance.id(), "migration_id" => %migration_id, "src_propolis_id" => %src_propolis_id, From 3e5b6d1e3822546dd9588f9dd727213d45e17c63 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:36:29 -0700 Subject: [PATCH 119/234] rm vestigial line --- nexus/db-queries/src/db/datastore/vmm.rs | 2 -- sled-agent/src/common/instance.rs | 1 - 2 files changed, 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 798bdf2b4f5..0ffd4b1f88d 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -163,8 +163,6 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - // debug_assert_eq!(result.instance_status, ); - let vmm_updated = match result.vmm_status { Some(UpdateStatus::Updated) => true, Some(UpdateStatus::NotUpdatedButExists) => false, diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 4e0d7a57fdc..ad974c285a3 100644 --- a/sled-agent/src/common/instance.rs +++ 
b/sled-agent/src/common/instance.rs @@ -213,7 +213,6 @@ impl InstanceStates { SledInstanceState { vmm_state: self.vmm.clone(), propolis_id: self.propolis_id, - // migration_state: self.migration.clone(), migration_in: self.migration_in.clone(), migration_out: self.migration_out.clone(), } From 32ea68f1bedc681dbe856a8dc4d45e8e96928b06 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 13:49:59 -0700 Subject: [PATCH 120/234] THERES MORE OF THEM AGHGHGHGHHGHHGH --- nexus/tests/integration_tests/vpc_subnets.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/tests/integration_tests/vpc_subnets.rs b/nexus/tests/integration_tests/vpc_subnets.rs index b12c43aeccc..f063c7e9a26 100644 --- a/nexus/tests/integration_tests/vpc_subnets.rs +++ b/nexus/tests/integration_tests/vpc_subnets.rs @@ -4,6 +4,7 @@ use crate::integration_tests::instances::instance_post; use crate::integration_tests::instances::instance_simulate; +use crate::integration_tests::instances::instance_wait_for_state; use crate::integration_tests::instances::InstanceOp; use dropshot::HttpErrorResponseBody; use http::method::Method; @@ -20,6 +21,7 @@ use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::{params, views::VpcSubnet}; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::IdentityMetadataUpdateParams; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::Ipv6NetExt; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -80,6 +82,7 @@ async fn test_delete_vpc_subnet_with_interfaces_fails( // Stop and then delete the instance instance_post(client, instance_name, InstanceOp::Stop).await; instance_simulate(&nexus, &instance_id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; NexusRequest::object_delete(&client, &instance_url) .authn_as(AuthnMode::PrivilegedUser) .execute() From bedcfc66c1336e4bd43d25a1bb45b445c51acaf1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 15:46:52 -0700 Subject: [PATCH 121/234] chain into another saga (bad initial version) --- nexus/src/app/sagas/instance_update/mod.rs | 54 +++++++++++++++++++--- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 48485b76291..d1c3f3cc115 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -574,16 +574,58 @@ pub(super) async fn siu_unassign_oximeter_producer( async fn siu_update_and_unlock_instance( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let RealParams { - ref serialized_authn, ref authz_instance, ref update, .. - } = sagactx.saga_params::()?; + let RealParams { serialized_authn, authz_instance, ref update, .. } = + sagactx.saga_params::()?; unlock_instance_inner( - serialized_authn, - authz_instance, + &serialized_authn, + &authz_instance, &sagactx, Some(&update.new_runtime), ) - .await + .await?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + let osagactx = sagactx.user_data(); + + // fetch the state from the database again to see if we should immediately + // run a new saga. + // TODO(eliza): go back and make the unlock-instance query return the + // current state, instead... 
+ let new_state = match osagactx + .datastore() + .instance_fetch_all(&opctx, &authz_instance) + .await + { + Ok(s) => s, + Err(e) => { + warn!(osagactx.log(), "instance update: failed to fetch state on saga completion"; + "instance_id" => %authz_instance.id(), + "error" => %e); + // if we can't refetch here, don't unwind all the work we did do. + // the instance-updater background task will take care of it. + return Ok(()); + } + }; + + if UpdatesRequired::for_snapshot(osagactx.log(), &new_state).is_some() { + if let Err(e) = osagactx + .nexus() + .sagas + .saga_execute::(Params { + // everyone in the friend group just venmo-ing the same + // serialized_authn back and forth forever. + serialized_authn, + authz_instance, + }) + .await + { + // again, if this fails, don't unwind all the good work we already did. + warn!(osagactx.log(), "instant update: subsequent saga execution failed"; "error" => %e); + } + } + + Ok(()) } async fn unlock_instance_inner( From 529684b68dde8e23468da3c71f77dd3010396e4a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 9 Jul 2024 15:46:58 -0700 Subject: [PATCH 122/234] Revert "don't re-query sled ID for network cfg update" This reverts commit 0623ae80ad4fc3f69a295a262ea6123cdc1e811d. --- nexus/src/app/sagas/instance_update/mod.rs | 35 +++++++++++----------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index d1c3f3cc115..7a656b120b5 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -57,7 +57,7 @@ struct UpdatesRequired { #[derive(Debug, Deserialize, Serialize)] enum NetworkConfigUpdate { Delete, - Update { active_propolis_id: PropolisUuid, new_sled_id: Uuid }, + Update(PropolisUuid), } impl UpdatesRequired { @@ -156,16 +156,11 @@ impl UpdatesRequired { // creating these mappings, this path only needs to be taken if an // instance has changed sleds. if failed && destroy_active_vmm.is_none() { - network_config = Some(NetworkConfigUpdate::Update { - active_propolis_id: PropolisUuid::from_untyped_uuid( + network_config = Some(NetworkConfigUpdate::Update( + PropolisUuid::from_untyped_uuid( migration.source_propolis_id, ), - new_sled_id: snapshot - .active_vmm - .as_ref() - .expect("if we're here, there must be an active VMM") - .sled_id, - }); + )); update_required = true; } @@ -181,16 +176,11 @@ impl UpdatesRequired { "target_propolis_id" => %migration.target_propolis_id, ); - network_config = Some(NetworkConfigUpdate::Update { - active_propolis_id: PropolisUuid::from_untyped_uuid( + network_config = Some(NetworkConfigUpdate::Update( + PropolisUuid::from_untyped_uuid( migration.target_propolis_id, ), - new_sled_id: snapshot - .target_vmm - .as_ref() - .expect("if we're here, there must be a target VMM") - .sled_id, - }); + )); new_runtime.propolis_id = Some(migration.target_propolis_id); // Even if the active VMM was destroyed (and we set the // instance's state to `NoVmm` above), it has successfully @@ -451,7 +441,16 @@ async fn siu_update_network_config( .await .map_err(ActionError::action_failed)?; } - NetworkConfigUpdate::Update { active_propolis_id, new_sled_id } => { + NetworkConfigUpdate::Update(active_propolis_id) => { + // Look up the ID of the sled that the instance now resides on, so that we + // can look up its address. + let new_sled_id = osagactx + .datastore() + .vmm_fetch(&opctx, authz_instance, &active_propolis_id) + .await + .map_err(ActionError::action_failed)? 
+ .sled_id; + info!( osagactx.log(), "instance update: ensuring updated instance network config"; From 84ebd740427c00eec4cc7c5d00f6a5503016ff9d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 10 Jul 2024 09:50:56 -0700 Subject: [PATCH 123/234] found another place we need to wait for stop --- nexus/tests/integration_tests/ip_pools.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nexus/tests/integration_tests/ip_pools.rs b/nexus/tests/integration_tests/ip_pools.rs index d044eb735c8..e872cc6fe3b 100644 --- a/nexus/tests/integration_tests/ip_pools.rs +++ b/nexus/tests/integration_tests/ip_pools.rs @@ -6,6 +6,7 @@ use std::net::Ipv4Addr; +use crate::integration_tests::instances::instance_wait_for_state; use dropshot::test_util::ClientTestContext; use dropshot::HttpErrorResponseBody; use dropshot::ResultsPage; @@ -54,6 +55,7 @@ use nexus_types::external_api::views::SiloIpPool; use nexus_types::identity::Resource; use omicron_common::address::Ipv6Range; use omicron_common::api::external::IdentityMetadataUpdateParams; +use omicron_common::api::external::InstanceState; use omicron_common::api::external::LookupType; use omicron_common::api::external::NameOrId; use omicron_common::api::external::SimpleIdentity; @@ -1348,6 +1350,7 @@ async fn test_ip_range_delete_with_allocated_external_ip_fails( .unwrap() .expect("running instance should be on a sled"); sa.instance_finish_transition(instance.identity.id).await; + instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; // Delete the instance NexusRequest::object_delete(client, &instance_url) From 1cebe3e8ce2dbc26521c81fdfee915e59016c9b2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 10 Jul 2024 10:22:25 -0700 Subject: [PATCH 124/234] update omdb again --- dev-tools/omdb/tests/successes.out | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 7c8dadae4e0..395132a8026 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -487,14 +487,15 @@ task: "external_endpoints" TLS certificates: 0 task: "instance_updater" - configured period: every 30s + configured period: every 10m currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms total instances in need of updates: 0 instances with destroyed active VMMs: 0 instances with terminated active migrations: 0 - update sagas queued: 0 + update sagas started: 0 + update sagas completed successfully: 0 task: "instance_watcher" configured period: every s From 852669757e0f89518f6f2c73af5bad535f8ce4af Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 10 Jul 2024 11:52:59 -0700 Subject: [PATCH 125/234] clean up saga chaining code --- nexus/src/app/sagas/instance_update/mod.rs | 116 +++++++++++++++------ 1 file changed, 82 insertions(+), 34 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 7a656b120b5..1d3cdbb12c0 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -15,6 +15,7 @@ use crate::app::db::model::InstanceState; use crate::app::db::model::MigrationState; use crate::app::db::model::VmmState; use crate::app::sagas::declare_saga_actions; +use anyhow::Context; use chrono::Utc; use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; @@ -253,9 +254,12 @@ declare_saga_actions! 
{ + siu_unassign_oximeter_producer } - // Release the lock and write back the new instance record. - UPDATE_AND_UNLOCK_INSTANCE -> "unlocked" { - + siu_update_and_unlock_instance + // Write back the new instance record, releasing the instance updater lock, + // and re-fetch the VMM and migration states. If they have changed in a way + // that requires an additional update saga, attempt to execute an additional + // update saga immediately. + COMMIT_INSTANCE_UPDATES -> "commit_instance_updates" { + + siu_commit_instance_updates } } @@ -308,8 +312,8 @@ impl NexusSaga for SagaDoActualInstanceUpdate { } // Once we've finished mutating everything owned by the instance, we can - // write ck the updated state and release the instance lock. - builder.append(update_and_unlock_instance_action()); + // write back the updated state and release the instance lock. + builder.append(commit_instance_updates_action()); // If either VMM linked to this instance has been destroyed, append // subsagas to clean up the VMMs resources and mark them as deleted. @@ -570,7 +574,7 @@ pub(super) async fn siu_unassign_oximeter_producer( .map_err(ActionError::action_failed) } -async fn siu_update_and_unlock_instance( +async fn siu_commit_instance_updates( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let RealParams { serialized_authn, authz_instance, ref update, .. } = @@ -582,46 +586,90 @@ async fn siu_update_and_unlock_instance( Some(&update.new_runtime), ) .await?; + let instance_id = authz_instance.id(); + + // Check if the VMM or migration state has changed while the update saga was + // running and whether an additional update saga is now required. If one is + // required, try to start it. + // + // TODO(eliza): it would be nice if we didn't release the lock, determine + // the needed updates, and then start a new start-instance-update saga that + // re-locks the instance --- instead, perhaps we could keep the lock, and + // try to start a new "actual" instance update saga that inherits our lock. + // This way, we could also avoid computing updates required twice. + // But, I'm a bit sketched out by the implications of not committing update + // and dropping the lock in the same operation. This deserves more thought... + if let Err(error) = + chain_update_saga(&sagactx, authz_instance, serialized_authn).await + { + let osagactx = sagactx.user_data(); + // If starting the new update saga failed, DO NOT unwind this saga and + // undo all the work we've done successfully! Instead, just kick the + // instance-updater background task to try and start a new saga + // eventually, and log a warning. + warn!( + osagactx.log(), + "instance update: failed to start successor saga!"; + "instance_id" => %instance_id, + "error" => %error, + ); + osagactx.nexus().background_tasks.task_instance_updater.activate(); + } + + Ok(()) +} +async fn chain_update_saga( + sagactx: &NexusActionContext, + authz_instance: authz::Instance, + serialized_authn: authn::saga::Serialized, +) -> Result<(), anyhow::Error> { let opctx = - crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + crate::context::op_context_for_saga_action(sagactx, &serialized_authn); let osagactx = sagactx.user_data(); + let instance_id = authz_instance.id(); - // fetch the state from the database again to see if we should immediately + // Fetch the state from the database again to see if we should immediately // run a new saga. - // TODO(eliza): go back and make the unlock-instance query return the - // current state, instead... 
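[Editor's aside — illustrative sketch, not part of this patch.] The control flow this commit introduces in `siu_commit_instance_updates` — commit the new runtime state and drop the updater lock, then re-check the instance and hand any remaining work to a successor saga without ever unwinding the committed work — can be summarized standalone. Every name below (`commit_and_unlock`, `refetch_snapshot`, `start_successor_saga`, `activate_background_task`) is a stand-in, not a function from this codebase.

// Editorial sketch only: commit-then-chain, where failures after the commit
// are logged and deferred to a background task instead of unwinding.
#[derive(Debug)]
struct Snapshot {
    needs_more_work: bool,
}

fn commit_and_unlock() -> Result<(), String> {
    Ok(()) // stand-in for writing new_runtime and releasing the updater lock
}

fn refetch_snapshot() -> Result<Snapshot, String> {
    Ok(Snapshot { needs_more_work: true }) // stand-in for re-fetching instance state
}

fn start_successor_saga(_snapshot: &Snapshot) -> Result<(), String> {
    Err("saga start failed".to_string()) // pretend the hand-off fails
}

fn activate_background_task() {
    println!("kicked the instance-updater background task instead");
}

fn commit_instance_updates() -> Result<(), String> {
    // 1. Commit the update and release the lock; a failure here may still unwind.
    commit_and_unlock()?;

    // 2. From here on, nothing unwinds: try to chain a successor if more work
    //    appeared, and fall back to the background task on any error.
    let chained = refetch_snapshot().and_then(|snapshot| {
        if snapshot.needs_more_work {
            start_successor_saga(&snapshot)
        } else {
            Ok(())
        }
    });
    if let Err(error) = chained {
        eprintln!("failed to start successor saga: {error}");
        activate_background_task();
    }
    Ok(())
}

fn main() {
    commit_instance_updates().expect("the commit itself should succeed");
}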
- let new_state = match osagactx + let new_state = osagactx .datastore() .instance_fetch_all(&opctx, &authz_instance) .await - { - Ok(s) => s, - Err(e) => { - warn!(osagactx.log(), "instance update: failed to fetch state on saga completion"; - "instance_id" => %authz_instance.id(), - "error" => %e); - // if we can't refetch here, don't unwind all the work we did do. - // the instance-updater background task will take care of it. - return Ok(()); - } - }; + .context("failed to fetch latest snapshot for instance")?; - if UpdatesRequired::for_snapshot(osagactx.log(), &new_state).is_some() { - if let Err(e) = osagactx + if let Some(update) = + UpdatesRequired::for_snapshot(osagactx.log(), &new_state) + { + debug!( + osagactx.log(), + "instance update: additional updates required, preparing a \ + successor update saga..."; + "instance_id" => %instance_id, + "update.new_runtime_state" => ?update.new_runtime, + "update.network_config_update" => ?update.network_config, + "update.destroy_active_vmm" => ?update.destroy_active_vmm, + "update.destroy_target_vmm" => ?update.destroy_target_vmm, + "update.deprovision" => update.deprovision, + ); + let saga_dag = SagaInstanceUpdate::prepare(&Params { + serialized_authn, + authz_instance, + }) + .context("failed to build new update saga DAG")?; + let saga = osagactx .nexus() .sagas - .saga_execute::(Params { - // everyone in the friend group just venmo-ing the same - // serialized_authn back and forth forever. - serialized_authn, - authz_instance, - }) + .saga_prepare(saga_dag) .await - { - // again, if this fails, don't unwind all the good work we already did. - warn!(osagactx.log(), "instant update: subsequent saga execution failed"; "error" => %e); - } + .context("failed to prepare new update saga")?; + saga.start().await.context("failed to start successor update saga")?; + // N.B. that we don't wait for the successor update saga to *complete* + // here. We just want to make sure it starts. + info!( + osagactx.log(), + "instance update: successor update saga started!"; + "instance_id" => %instance_id, + ); } Ok(()) From 1eba9cdc3230b1e10455ddfa3d2e2534db69f12e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 10 Jul 2024 12:52:42 -0700 Subject: [PATCH 126/234] Reapply "don't re-query sled ID for network cfg update" This reverts commit 9fb5aa1cf67f598ad212bc8f41f3e0db5b471683. --- nexus/src/app/sagas/instance_update/mod.rs | 35 +++++++++++----------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 1d3cdbb12c0..7a4d1d4812b 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -58,7 +58,7 @@ struct UpdatesRequired { #[derive(Debug, Deserialize, Serialize)] enum NetworkConfigUpdate { Delete, - Update(PropolisUuid), + Update { active_propolis_id: PropolisUuid, new_sled_id: Uuid }, } impl UpdatesRequired { @@ -157,11 +157,16 @@ impl UpdatesRequired { // creating these mappings, this path only needs to be taken if an // instance has changed sleds. 
if failed && destroy_active_vmm.is_none() { - network_config = Some(NetworkConfigUpdate::Update( - PropolisUuid::from_untyped_uuid( + network_config = Some(NetworkConfigUpdate::Update { + active_propolis_id: PropolisUuid::from_untyped_uuid( migration.source_propolis_id, ), - )); + new_sled_id: snapshot + .active_vmm + .as_ref() + .expect("if we're here, there must be an active VMM") + .sled_id, + }); update_required = true; } @@ -177,11 +182,16 @@ impl UpdatesRequired { "target_propolis_id" => %migration.target_propolis_id, ); - network_config = Some(NetworkConfigUpdate::Update( - PropolisUuid::from_untyped_uuid( + network_config = Some(NetworkConfigUpdate::Update { + active_propolis_id: PropolisUuid::from_untyped_uuid( migration.target_propolis_id, ), - )); + new_sled_id: snapshot + .target_vmm + .as_ref() + .expect("if we're here, there must be a target VMM") + .sled_id, + }); new_runtime.propolis_id = Some(migration.target_propolis_id); // Even if the active VMM was destroyed (and we set the // instance's state to `NoVmm` above), it has successfully @@ -445,16 +455,7 @@ async fn siu_update_network_config( .await .map_err(ActionError::action_failed)?; } - NetworkConfigUpdate::Update(active_propolis_id) => { - // Look up the ID of the sled that the instance now resides on, so that we - // can look up its address. - let new_sled_id = osagactx - .datastore() - .vmm_fetch(&opctx, authz_instance, &active_propolis_id) - .await - .map_err(ActionError::action_failed)? - .sled_id; - + NetworkConfigUpdate::Update { active_propolis_id, new_sled_id } => { info!( osagactx.log(), "instance update: ensuring updated instance network config"; From 896d21f1271ed9f1d70da70d8eef691ba01f9b58 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 10 Jul 2024 13:55:23 -0700 Subject: [PATCH 127/234] cleanup migration update computation This way, we no longer try to do network config updates twice, and don't unlink the migration when only the target has completed --- nexus/src/app/sagas/instance_update/mod.rs | 151 ++++++++++++--------- 1 file changed, 90 insertions(+), 61 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 7a4d1d4812b..5f6eebb8cce 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -13,6 +13,7 @@ use crate::app::db::model::Generation; use crate::app::db::model::InstanceRuntimeState; use crate::app::db::model::InstanceState; use crate::app::db::model::MigrationState; +use crate::app::db::model::Vmm; use crate::app::db::model::VmmState; use crate::app::sagas::declare_saga_actions; use anyhow::Context; @@ -121,19 +122,15 @@ impl UpdatesRequired { } }); - // Determine what to do with the migration. + // If there's an active migration, determine how to update the instance + // record to reflect the current migration state. if let Some(ref migration) = snapshot.migration { - // Determine how to update the instance record to reflect the current - // migration state. - let failed = migration.either_side_failed(); - // If the migration has failed, or if the target reports that the migration - // has completed, clear the instance record's migration IDs so that a new - // migration can begin. - if failed || migration.target_state == MigrationState::COMPLETED { + if migration.either_side_failed() { + // If the migration has failed, clear the instance record's + // migration IDs so that a new migration can begin. 
info!( log, - "instance update (migration {}): clearing migration IDs", - if failed { "failed" } else { "target completed" }; + "instance update (migration failed): clearing migration IDs"; "instance_id" => %instance_id, "migration_id" => %migration.id, "src_propolis_id" => %migration.source_propolis_id, @@ -142,66 +139,89 @@ impl UpdatesRequired { new_runtime.migration_id = None; new_runtime.dst_propolis_id = None; update_required = true; - } - - // If the active VMM was destroyed, the network config must be - // deleted (which was determined above). Otherwise, if the - // migration failed but the active VMM was still there, we must - // still ensure the correct networking configuration - // exists for its current home. - // - // TODO(#3107) This is necessary even if the instance didn't move, - // because registering a migration target on a sled creates OPTE ports - // for its VNICs, and that creates new V2P mappings on that sled that - // place the relevant virtual IPs on the local sled. Once OPTE stops - // creating these mappings, this path only needs to be taken if an - // instance has changed sleds. - if failed && destroy_active_vmm.is_none() { - network_config = Some(NetworkConfigUpdate::Update { - active_propolis_id: PropolisUuid::from_untyped_uuid( - migration.source_propolis_id, - ), - new_sled_id: snapshot - .active_vmm - .as_ref() - .expect("if we're here, there must be an active VMM") - .sled_id, - }); - update_required = true; - } + // If the active VMM was destroyed, the network config must be + // deleted (which was determined above). Otherwise, if the + // migration failed but the active VMM was still there, we must + // still ensure the correct networking configuration + // exists for its current home. + // + // TODO(#3107) This is necessary even if the instance didn't move, + // because registering a migration target on a sled creates OPTE ports + // for its VNICs, and that creates new V2P mappings on that sled that + // place the relevant virtual IPs on the local sled. Once OPTE stops + // creating these mappings, this path only needs to be taken if an + // instance has changed sleds. + if destroy_active_vmm.is_none() { + if let Some(ref active_vmm) = snapshot.active_vmm { + info!( + log, + "instance update (migration failed): pointing network \ + config back at current VMM"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + network_config = + Some(NetworkConfigUpdate::to_vmm(active_vmm)); + } else { + // Otherwise, the active VMM has already been destroyed, + // and the target is reporting a failure because of + // that. Just delete the network config. + } + } + } else if migration.either_side_completed() { + // If either side reports that the migration has completed, set + // the instance record's active Propolis ID to point at the new + // VMM, and update the network configuration to point at that VMM. 
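[Editor's aside — illustrative sketch, not part of this patch.] The migration-outcome branching described in the comments above (failed: unlink the migration and repoint the network at the surviving active VMM; completed: promote the target VMM, which the code that follows implements) condenses to the sketch below. `Side` and `Plan` are simplified stand-in types, not the database models, and deprovisioning/generation details are intentionally omitted.

// Editorial sketch only: condensed restatement of the outcome decision.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Side {
    Failed,
    Completed,
    InProgress,
}

#[derive(Debug, Default)]
struct Plan {
    clear_migration_ids: bool,
    promote_target_to_active: bool,
    point_network_at_target: bool,
    point_network_back_at_source: bool,
}

fn plan(source: Side, target: Side, active_vmm_destroyed: bool) -> Plan {
    let mut plan = Plan::default();
    if source == Side::Failed || target == Side::Failed {
        // Failed: unlink the migration so a new one may start; if the active
        // VMM still exists, point the network config back at it.
        plan.clear_migration_ids = true;
        plan.point_network_back_at_source = !active_vmm_destroyed;
    } else if source == Side::Completed || target == Side::Completed {
        // Completed: the target VMM becomes active and the network config
        // follows it; the migration is only unlinked once the *target* side
        // reports completion.
        plan.promote_target_to_active = true;
        plan.point_network_at_target = true;
        plan.clear_migration_ids = target == Side::Completed;
    }
    plan
}

fn main() {
    println!("{:?}", plan(Side::Completed, Side::InProgress, false));
    println!("{:?}", plan(Side::Failed, Side::InProgress, true));
}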
+ if new_runtime.propolis_id != Some(migration.target_propolis_id) + { + info!( + log, + "instance update (migration completed): setting active \ + VMM ID to target and updating network config"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + let new_vmm = snapshot.target_vmm.as_ref().expect( + "if we have gotten here, there must be a target VMM", + ); + debug_assert_eq!(new_vmm.id, migration.target_propolis_id); + new_runtime.propolis_id = + Some(migration.target_propolis_id); + update_required = true; + network_config = Some(NetworkConfigUpdate::to_vmm(new_vmm)); + } - // If either side reports that the migration has completed, move the target - // Propolis ID to the active position. - if !failed && migration.either_side_completed() { - info!( - log, - "instance update (migration completed): setting active VMM ID to target"; - "instance_id" => %instance_id, - "migration_id" => %migration.id, - "src_propolis_id" => %migration.source_propolis_id, - "target_propolis_id" => %migration.target_propolis_id, - ); + // If the target reports that the migration has completed, + // unlink the migration (allowing a new one to begin). This has + // to wait until the target has reported completion to ensure a + // migration out of the target can't start until the migration + // in has definitely finished. + if migration.target_state == MigrationState::COMPLETED { + info!( + log, + "instance update (migration target completed): \ + clearing migration IDs"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + new_runtime.migration_id = None; + new_runtime.dst_propolis_id = None; + update_required = true; + } - network_config = Some(NetworkConfigUpdate::Update { - active_propolis_id: PropolisUuid::from_untyped_uuid( - migration.target_propolis_id, - ), - new_sled_id: snapshot - .target_vmm - .as_ref() - .expect("if we're here, there must be a target VMM") - .sled_id, - }); - new_runtime.propolis_id = Some(migration.target_propolis_id); // Even if the active VMM was destroyed (and we set the // instance's state to `NoVmm` above), it has successfully // migrated, so leave it in the VMM state. new_runtime.nexus_state = InstanceState::Vmm; - new_runtime.dst_propolis_id = None; // If the active VMM has also been destroyed, don't delete // virtual provisioning records while cleaning it up. deprovision = false; - update_required = true; } } @@ -219,6 +239,15 @@ impl UpdatesRequired { } } +impl NetworkConfigUpdate { + fn to_vmm(vmm: &Vmm) -> Self { + Self::Update { + active_propolis_id: PropolisUuid::from_untyped_uuid(vmm.id), + new_sled_id: vmm.sled_id, + } + } +} + /// Parameters to the "real" instance update saga. 
#[derive(Debug, Deserialize, Serialize)] struct RealParams { From b09503cd05fa15b0372e21366423f72bdcaa291a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 09:29:21 -0700 Subject: [PATCH 128/234] misc review feedback cleanup --- nexus/db-queries/src/db/queries/instance.rs | 8 ++------ nexus/src/app/sagas/snapshot_create.rs | 2 +- nexus/tests/integration_tests/instances.rs | 9 ++++----- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index c8bb6a7e091..fcaa83cc325 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -109,14 +109,10 @@ pub struct InstanceAndVmmUpdateResult { /// indicates whether the row was updated. `None` if the VMM was not found. pub vmm_status: Option, - /// `Some(status)` if the inbound migration was found; the wrapped `UpdateStatus` - /// indicates whether the row was updated. `None` if the inbound migration - /// was not found, or no migration update was performed. + /// Indicates whether a migration-in update was performed. pub migration_in_status: RecordUpdateStatus, - /// `Some(status)` if the outbound migration was found; the wrapped `UpdateStatus` - /// indicates whether the row was updated. `None` if the inbound migration - /// was not found, or no migration update was performed. + /// Indicates whether a migration-out update was performed. pub migration_out_status: RecordUpdateStatus, } diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index 797ca842665..fdff9c14a20 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -2334,7 +2334,7 @@ mod test { &Duration::from_secs(5), &Duration::from_secs(300), ) - .await.expect("instance did not advance to NoVmm after 400 seconds"); + .await.expect("instance did not advance to NoVmm after 300 seconds"); test_helpers::instance_delete_by_name( cptestctx, INSTANCE_NAME, diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index c46bafa5088..633d19b5dd8 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -838,11 +838,6 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(migration.target_state, MigrationState::Pending.into()); assert_eq!(migration.source_state, MigrationState::Pending.into()); - // Explicitly simulate the migration action on the target. Simulated - // migrations always succeed. The state transition on the target is - // sufficient to move the instance back into a Running state (strictly - // speaking no further updates from the source are required if the target - // successfully takes over). instance_simulate_migration_source( cptestctx, nexus, @@ -851,6 +846,10 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { migration_id, ) .await; + // TODO(eliza): it would be nice to single-step both simulated sled agents + // through each migration phase and assert that we see all the intermediate + // states, instead of just letting them run straight to completion... + // Ensure that both sled agents report that the migration has completed. 
instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) .await; From 7ecb9a86993acea4fbac1a7f427c585d06f81c98 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 10:54:08 -0700 Subject: [PATCH 129/234] nicer `instance_set_migration_ids` This commit adds a subquery for ensuring that the active VMM is in the `Running` or `Rebooting` states, so that we know it's okay to migrate out of when we start the migration. Additionally, this changes the query to return the instance record, so that the saga action `sim_set_migration_ids` can avoid a refetch query. --- nexus/db-model/src/schema.rs | 2 + nexus/db-model/src/vmm_state.rs | 2 +- nexus/db-queries/src/db/datastore/instance.rs | 45 ++++++++++++------- nexus/src/app/sagas/instance_migrate.rs | 18 -------- 4 files changed, 33 insertions(+), 34 deletions(-) diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 246edecd335..845da13a446 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -425,6 +425,8 @@ table! { } } +joinable!(instance -> vmm (active_propolis_id)); + table! { vmm (id) { id -> Uuid, diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index 058e29ba95e..b61d79624af 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -8,7 +8,7 @@ use serde::Serialize; use std::fmt; impl_enum_type!( - #[derive(SqlType, Debug)] + #[derive(SqlType, Debug, Clone)] #[diesel(postgres_type(name = "vmm_state", schema = "public"))] pub struct VmmStateEnum; diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 3718338f51b..31817f6331a 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -26,6 +26,7 @@ use crate::db::model::Name; use crate::db::model::Project; use crate::db::model::Sled; use crate::db::model::Vmm; +use crate::db::model::VmmState; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateAndQueryResult; @@ -581,15 +582,11 @@ impl DataStore { Ok(updated) } - /// Updates an instance record by setting the instance's migration ID. - // - // TODO-design It's tempting to return the updated state of the Instance - // here because it's convenient for consumers and by using a RETURNING - // clause, we could ensure that the "update" and "fetch" are atomic. - // But in the unusual case that we _don't_ update the row because our - // update is older than the one in the database, we would have to fetch - // the current state explicitly. For now, we'll just require consumers - // to explicitly fetch the state if they want that. + /// Updates an instance record by setting the instance's migration ID to the + /// provided `migration_id` and the target VMM ID to the provided + /// `target_propolis_id`, if the instance does not currently have an active + /// migration, and the active VMM is in the [`VmmState::Running`] or + /// [`VmmState::Rebooting`] states. pub async fn instance_set_migration_ids( &self, opctx: &OpContext, @@ -597,18 +594,33 @@ impl DataStore { src_propolis_id: PropolisUuid, migration_id: Uuid, target_propolis_id: PropolisUuid, - ) -> Result { + ) -> Result { use db::schema::instance::dsl; + use db::schema::vmm::dsl as vmm_dsl; + + // Only allow migrating out if the active VMM is running or rebooting. 
+ const ALLOWED_ACTIVE_VMM_STATES: &[VmmState] = + &[VmmState::Running, VmmState::Rebooting]; let instance_id = instance_id.into_untyped_uuid(); let target_propolis_id = target_propolis_id.into_untyped_uuid(); let src_propolis_id = src_propolis_id.into_untyped_uuid(); + + // Subquery for determining whether the active VMM is in a state where + // it can be migrated out of. This returns the VMM row's instance ID, so + // that we can use it in a `filter` on the update query. + let vmm_ok = vmm_dsl::vmm + .filter(vmm_dsl::id.eq(src_propolis_id)) + .filter(vmm_dsl::state.eq_any(ALLOWED_ACTIVE_VMM_STATES)) + .select(vmm_dsl::instance_id); + let updated = diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(instance_id)) .filter(dsl::migration_id.is_null()) .filter(dsl::target_propolis_id.is_null()) .filter(dsl::active_propolis_id.eq(src_propolis_id)) + .filter(dsl::id.eq_any(vmm_ok)) .set(( dsl::migration_id.eq(Some(migration_id)), dsl::target_propolis_id.eq(Some(target_propolis_id)), @@ -616,6 +628,9 @@ impl DataStore { dsl::state_generation.eq(dsl::state_generation + 1), dsl::time_state_updated.eq(Utc::now()), )) + // TODO(eliza): it's too bad we can't do `check_if_exists` with both + // the instance and active VMM, so that we could return a nicer + // error in the case where the active VMM is in the wrong state... .check_if_exists::(instance_id.into_untyped_uuid()) .execute_and_check(&*self.pool_connection_authorized(&opctx).await?) .await @@ -631,12 +646,12 @@ impl DataStore { match updated { // If we updated the instance, that's great! Good job team! - UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { - Ok(true) + UpdateAndQueryResult { status: UpdateStatus::Updated, found } => { + Ok(found) } // No update was performed because the migration ID has already been // set to the ID we were trying to set it to. That's fine, count it - // as a success. + // as a success for saga action idempotency reasons. UpdateAndQueryResult { found, .. } if found.runtime_state.migration_id == Some(migration_id) => { @@ -648,7 +663,7 @@ impl DataStore { found.runtime_state.propolis_id, Some(src_propolis_id) ); - Ok(false) + Ok(found) } // On the other hand, if there was already a different migration ID, @@ -683,7 +698,7 @@ impl DataStore { } => { slog::warn!( opctx.log, - "failed to set instance migration IDs: one of its Propolis IDs was what way we anticipated!"; + "failed to set instance migration IDs: invalid instance or VMM runtime state"; "instance_id" => %instance_id, "desired_migration_id" => %migration_id, "desired_active_propolis_id" => %src_propolis_id, diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index ba3af62e939..58534bb1d37 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -365,24 +365,6 @@ async fn sim_set_migration_ids( dst_propolis_id, ) .await - .map_err(ActionError::action_failed)?; - - // Refetch the instance to make sure we have the correct thing to send to - // sled-agents. - // TODO(eliza): we *could* probably just munge the previous - // `InstanceRuntimeState` to have the migration IDs set, but...that feels - // sketchy. Doing another db query here to get the latest state is kinda sad - // but whatever. 
- let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) - .instance_id(db_instance.id()) - .lookup_for(authz::Action::Read) - .await - .map_err(ActionError::action_failed)?; - - osagactx - .datastore() - .instance_refetch(&opctx, &authz_instance) - .await .map_err(ActionError::action_failed) } From d3db8590b56a39484d5f0b1178b64f9be96fd809 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 11:34:03 -0700 Subject: [PATCH 130/234] actually return UPDATED state in `instance_set_migration_ids` --- nexus/db-queries/src/db/datastore/instance.rs | 103 ++++-------------- 1 file changed, 24 insertions(+), 79 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 31817f6331a..de7764bdd77 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -614,11 +614,26 @@ impl DataStore { .filter(vmm_dsl::state.eq_any(ALLOWED_ACTIVE_VMM_STATES)) .select(vmm_dsl::instance_id); - let updated = diesel::update(dsl::instance) + diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(instance_id)) - .filter(dsl::migration_id.is_null()) - .filter(dsl::target_propolis_id.is_null()) + // To ensure that saga actions that set migration IDs are + // idempotent, we update the row if the migration and target VMM IDs + // are not present *or* if they are already equal to the desired + // values. This way, we can use a `RETURNING` clause to fetch the + // current state after the update, rather than `check_if_exists` + // which returns the prior state, and still fail to update the + // record if another migration/target VMM ID is already there. + .filter( + dsl::migration_id + .is_null() + .or(dsl::migration_id.eq(Some(migration_id))), + ) + .filter( + dsl::target_propolis_id + .is_null() + .or(dsl::target_propolis_id.eq(Some(target_propolis_id))), + ) .filter(dsl::active_propolis_id.eq(src_propolis_id)) .filter(dsl::id.eq_any(vmm_ok)) .set(( @@ -628,90 +643,20 @@ impl DataStore { dsl::state_generation.eq(dsl::state_generation + 1), dsl::time_state_updated.eq(Utc::now()), )) - // TODO(eliza): it's too bad we can't do `check_if_exists` with both - // the instance and active VMM, so that we could return a nicer - // error in the case where the active VMM is in the wrong state... - .check_if_exists::(instance_id.into_untyped_uuid()) - .execute_and_check(&*self.pool_connection_authorized(&opctx).await?) + .returning(Instance::as_returning()) + .get_result_async::( + &*self.pool_connection_authorized(opctx).await?, + ) .await .map_err(|e| { public_error_from_diesel( e, ErrorHandler::NotFoundByLookup( ResourceType::Instance, - LookupType::ById(instance_id), + LookupType::ById(instance_id.into_untyped_uuid()), ), ) - })?; - - match updated { - // If we updated the instance, that's great! Good job team! - UpdateAndQueryResult { status: UpdateStatus::Updated, found } => { - Ok(found) - } - // No update was performed because the migration ID has already been - // set to the ID we were trying to set it to. That's fine, count it - // as a success for saga action idempotency reasons. - UpdateAndQueryResult { found, .. 
} - if found.runtime_state.migration_id == Some(migration_id) => - { - debug_assert_eq!( - found.runtime_state.dst_propolis_id, - Some(target_propolis_id) - ); - debug_assert_eq!( - found.runtime_state.propolis_id, - Some(src_propolis_id) - ); - Ok(found) - } - - // On the other hand, if there was already a different migration ID, - // that means another migrate saga has already started a migration. - // Guess I'll die! - UpdateAndQueryResult { - found: - Instance { - runtime_state: - InstanceRuntimeState { - migration_id: Some(actual_migration_id), - .. - }, - .. - }, - .. - } => { - slog::info!( - opctx.log, - "failed to set instance migration IDs: a different migration ID was already set"; - "instance_id" => %instance_id, - "desired_migration_id" => %migration_id, - "actual_migration_id" => %actual_migration_id, - ); - Err(Error::conflict("instance is already migrating")) - } - // If one of the other filters didn't match, our understanding of - // the instance's state is clearly pretty wromg. - UpdateAndQueryResult { - found: Instance { runtime_state, .. }, - .. - } => { - slog::warn!( - opctx.log, - "failed to set instance migration IDs: invalid instance or VMM runtime state"; - "instance_id" => %instance_id, - "desired_migration_id" => %migration_id, - "desired_active_propolis_id" => %src_propolis_id, - "desired_target_propolis_id" => %target_propolis_id, - "actual_migration_id" => ?runtime_state.migration_id, - "actual_active_propolis_id" => ?runtime_state.propolis_id, - "actual_target_propolis_id" => ?runtime_state.dst_propolis_id, - ); - Err(Error::conflict( - "instance snapshot didn't match actual state", - )) - } - } + }) } /// Unsets the migration IDs set by From 6571b1a559acd80c77d86103c69a9c585b882adc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 11:34:57 -0700 Subject: [PATCH 131/234] single-step through states in migration test --- nexus/tests/integration_tests/instances.rs | 39 ++++++++++++++++++---- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 633d19b5dd8..49ab48a84b6 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -838,6 +838,9 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { assert_eq!(migration.target_state, MigrationState::Pending.into()); assert_eq!(migration.source_state, MigrationState::Pending.into()); + // Simulate the migration. We will use `instance_single_step_on_sled` to + // single-step both sled-agents through the migration state machine and + // ensure that the migration state looks nice at each step. instance_simulate_migration_source( cptestctx, nexus, @@ -846,17 +849,41 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { migration_id, ) .await; - // TODO(eliza): it would be nice to single-step both simulated sled agents - // through each migration phase and assert that we see all the intermediate - // states, instead of just letting them run straight to completion... - // Ensure that both sled agents report that the migration has completed. + // Move source to "migrating". 
+ instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; + + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.source_state, MigrationState::InProgress.into()); + assert_eq!(migration.target_state, MigrationState::Pending.into()); + let instance = instance_get(&client, &instance_url).await; + assert_eq!(instance.runtime.run_state, InstanceState::Migrating); + + // Move target to "migrating". + instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + .await; + + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.source_state, MigrationState::InProgress.into()); + assert_eq!(migration.target_state, MigrationState::InProgress.into()); + let instance = instance_get(&client, &instance_url).await; + assert_eq!(instance.runtime.run_state, InstanceState::Migrating); + + // Move the source to "completed" instance_simulate_on_sled(cptestctx, nexus, original_sled, instance_id) .await; - instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + let migration = dbg!(migration_fetch(cptestctx, migration_id).await); + assert_eq!(migration.source_state, MigrationState::Completed.into()); + assert_eq!(migration.target_state, MigrationState::InProgress.into()); let instance = instance_get(&client, &instance_url).await; - assert_eq!(instance.runtime.run_state, InstanceState::Running); + assert_eq!(instance.runtime.run_state, InstanceState::Migrating); + + // Move the target to "completed". + instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; + + instance_wait_for_state(&client, instance_id, InstanceState::Running).await; let current_sled = nexus .instance_sled_id(&instance_id) From 1f36487de69f08e6df1766d492260d1677305e05 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 12:02:44 -0700 Subject: [PATCH 132/234] fix migration-source sled agents creating state at gen 1 --- sled-agent/src/common/instance.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index ad974c285a3..82087109905 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -240,13 +240,19 @@ impl InstanceStates { m.gen = m.gen.next(); } else { m.migration_id = id; - m.gen = Generation::new(); + m.gen = Generation::new().next(); } m.time_updated = now; } else { *current = Some(MigrationRuntimeState { migration_id: id, - gen: Generation::new(), + // We are creating a new migration record, but the state + // will not be `Pending`, because we've actually gotten a + // migration observation from Propolis. Therefore, we have + // to advance the initial generation once to be ahead of + // what the generation in the database is when Nexus creates + // the initial migration record at generation 1. 
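[Editor's aside — illustrative sketch, not part of this patch.] The comment above, and the `Generation::new().next()` initializer that follows, rest on the generation guard used by the update CTE (`migration.target_gen < $N` in the queries elsewhere in this series): an observation only lands if its generation is strictly greater than what is stored. The sketch below uses a simplified stand-in `Generation` type to show why an observation stamped at generation 1 would be dropped against Nexus's initial record, while `new().next()` (generation 2) supersedes it.

// Editorial sketch only: last-writer-wins by generation, strictly-newer-only.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Generation(u64);

impl Generation {
    fn new() -> Self {
        Generation(1) // what the initial migration record is created at
    }
    fn next(self) -> Self {
        Generation(self.0 + 1)
    }
}

struct MigrationRow {
    state: &'static str,
    gen: Generation,
}

fn apply_update(row: &mut MigrationRow, state: &'static str, gen: Generation) -> bool {
    // Mirrors the `target_gen < incoming_gen` guard: stale or equal
    // generations are ignored.
    if gen > row.gen {
        row.state = state;
        row.gen = gen;
        true
    } else {
        false
    }
}

fn main() {
    // Nexus creates the record at generation 1...
    let mut row = MigrationRow { state: "pending", gen: Generation::new() };
    // ...so an observation also stamped at generation 1 is dropped, while one
    // starting from `Generation::new().next()` is applied.
    assert!(!apply_update(&mut row, "in_progress", Generation::new()));
    assert!(apply_update(&mut row, "in_progress", Generation::new().next()));
    println!("{} @ {:?}", row.state, row.gen);
}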
+ gen: Generation::new().next(), state, time_updated: now, }); From 7b39931ef66d6310a04b2655e118567c320a3a57 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 12:25:03 -0700 Subject: [PATCH 133/234] fix migration update query not bumping generation --- nexus/tests/integration_tests/instances.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 49ab48a84b6..904677c3d4b 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -853,6 +853,8 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { // Move source to "migrating". instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) .await; + instance_single_step_on_sled(cptestctx, nexus, original_sled, instance_id) + .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); assert_eq!(migration.source_state, MigrationState::InProgress.into()); @@ -863,6 +865,8 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { // Move target to "migrating". instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) .await; + instance_single_step_on_sled(cptestctx, nexus, dst_sled_id, instance_id) + .await; let migration = dbg!(migration_fetch(cptestctx, migration_id).await); assert_eq!(migration.source_state, MigrationState::InProgress.into()); @@ -877,7 +881,7 @@ async fn test_instance_migrate(cptestctx: &ControlPlaneTestContext) { let migration = dbg!(migration_fetch(cptestctx, migration_id).await); assert_eq!(migration.source_state, MigrationState::Completed.into()); assert_eq!(migration.target_state, MigrationState::InProgress.into()); - let instance = instance_get(&client, &instance_url).await; + let instance = dbg!(instance_get(&client, &instance_url).await); assert_eq!(instance.runtime.run_state, InstanceState::Migrating); // Move the target to "completed". From 6ef212059bc6d286ee75c982dacaf6c0db9191cb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 12:40:00 -0700 Subject: [PATCH 134/234] report instance states as "migrating" until migration resolves This fixes a problem where an instance whose active VMM has been destroyed will incorrectly be reported as "stopping" until an instance-update saga sorts out the actual migration outcome. --- nexus/db-queries/src/db/datastore/instance.rs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index de7764bdd77..9b403fde3e6 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -121,6 +121,28 @@ impl From for external::Instance { | VmmState::SagaUnwound, ), ) => external::InstanceState::Stopping, + // - If there's an active migration ID for the instance, *always* + // treat its state as "migration" regardless of the VMM's state. + // + // This avoids an issue where an instance whose previous active + // VMM has been destroyed as a result of a successful migration + // out will appear to be "stopping" for the time between when that + // VMM was reported destroyed and when the instance record was + // updated to reflect the migration's completion. 
+ // + // Instead, we'll continue to report the instance's state as + // "migrating" until an instance-update saga has resolved the + // outcome of the migration, since only the instance-update saga + // If the instance actually *has* stopped or failed before a + // successful migration out, this is fine, because an + // instance-update saga will come along and remove the active VMM + // and migration IDs. + // + (InstanceState::Vmm, Some(_)) + if value.instance.runtime_state.migration_id.is_some() => + { + external::InstanceState::Migrating + } // - An instance with no VMM is always "stopped" (as long as it's // not "starting" etc.) (InstanceState::NoVmm, _vmm_state) => { From dfc0240439876abd6b2f2e963b21ae9db2464c94 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 12:40:19 -0700 Subject: [PATCH 135/234] poke instances twice in single-steppy migration test --- nexus/db-queries/src/db/queries/instance.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/instance.rs index fcaa83cc325..c73f7ac6806 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/instance.rs @@ -249,6 +249,7 @@ impl InstanceAndVmmUpdate { .set(( migration_dsl::target_state.eq(state), migration_dsl::time_target_updated.eq(time_updated), + migration_dsl::target_gen.eq(gen), )), ); Update { @@ -280,6 +281,7 @@ impl InstanceAndVmmUpdate { .set(( migration_dsl::source_state.eq(state), migration_dsl::time_source_updated.eq(time_updated), + migration_dsl::source_gen.eq(gen), )), ); Update { From c47ed9093416fd8704b8fd387d379bf7dc25031e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 13:01:45 -0700 Subject: [PATCH 136/234] gotta wait for update saga to go back to running --- nexus/tests/integration_tests/instances.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 904677c3d4b..d80938daabd 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1044,9 +1044,8 @@ async fn test_instance_migrate_v2p_and_routes( instance_simulate_on_sled(cptestctx, nexus, original_sled_id, instance_id) .await; instance_simulate_on_sled(cptestctx, nexus, dst_sled_id, instance_id).await; - let instance = instance_get(&client, &instance_url).await; + instance_wait_for_state(&client, instance_id, InstanceState::Running).await; - assert_eq!(instance.runtime.run_state, InstanceState::Running); let current_sled = nexus .instance_sled_id(&instance_id) .await From 9f7e1016696afaf609ba7d19a4593db2fadd2624 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 13:38:01 -0700 Subject: [PATCH 137/234] whoops, migration arm needs to be BEFORE destroyed --- nexus/db-queries/src/db/datastore/instance.rs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 9b403fde3e6..b7091f17c3c 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -109,18 +109,6 @@ impl From for external::Instance { // We want to only report that an instance is `Stopped` when a new // `instance-start` saga is able to proceed. 
That means that: let run_state = match (instance_state, vmm_state) { - // - An instance with a "stopped" or "destroyed" VMM needs to be - // recast as a "stopping" instance, as the virtual provisioning - // resources for that instance have not been deallocated until the - // active VMM ID has been unlinked by an update saga. - ( - InstanceState::Vmm, - Some( - VmmState::Stopped - | VmmState::Destroyed - | VmmState::SagaUnwound, - ), - ) => external::InstanceState::Stopping, // - If there's an active migration ID for the instance, *always* // treat its state as "migration" regardless of the VMM's state. // @@ -143,6 +131,18 @@ impl From for external::Instance { { external::InstanceState::Migrating } + // - An instance with a "stopped" or "destroyed" VMM needs to be + // recast as a "stopping" instance, as the virtual provisioning + // resources for that instance have not been deallocated until the + // active VMM ID has been unlinked by an update saga. + ( + InstanceState::Vmm, + Some( + VmmState::Stopped + | VmmState::Destroyed + | VmmState::SagaUnwound, + ), + ) => external::InstanceState::Stopping, // - An instance with no VMM is always "stopped" (as long as it's // not "starting" etc.) (InstanceState::NoVmm, _vmm_state) => { From 59cf488df3b304f309ef17f6e533bab5349bbea3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 14:20:25 -0700 Subject: [PATCH 138/234] make filter expr more correct and match comment --- nexus/db-queries/src/db/datastore/instance.rs | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index b7091f17c3c..74de734951c 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -639,22 +639,21 @@ impl DataStore { diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(instance_id)) - // To ensure that saga actions that set migration IDs are - // idempotent, we update the row if the migration and target VMM IDs - // are not present *or* if they are already equal to the desired - // values. This way, we can use a `RETURNING` clause to fetch the - // current state after the update, rather than `check_if_exists` - // which returns the prior state, and still fail to update the - // record if another migration/target VMM ID is already there. .filter( - dsl::migration_id + // To ensure that saga actions that set migration IDs are + // idempotent, we update the row if the migration and target + // VMM IDs are not present *or* if they are already equal to the + // desired values. This way, we can use a `RETURNING` clause to + // fetch the current state after the update, rather than + // `check_if_exists` which returns the prior state, and still + // fail to update the record if another migration/target VMM ID + // is already there. 
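[Editor's aside — illustrative sketch, not part of this patch.] The idempotency rule spelled out in the comment above, and encoded by the filter expression that follows, amounts to a guarded compare-and-set that also hands back the updated row (the role of the `RETURNING` clause). The in-memory model below uses stand-in types and omits the active-VMM state precondition; it is only meant to pin down the succeed/no-op/conflict semantics.

// Editorial sketch only: set-migration-IDs semantics — succeed when nothing is
// set yet OR when the stored IDs already equal the requested ones (so a
// retried saga action is a no-op), and fail on any other values.
#[derive(Clone, Debug, PartialEq)]
struct InstanceRow {
    migration_id: Option<u32>,
    target_propolis_id: Option<u32>,
}

fn set_migration_ids(
    row: &mut InstanceRow,
    migration_id: u32,
    target_propolis_id: u32,
) -> Result<InstanceRow, &'static str> {
    let unset = row.migration_id.is_none() && row.target_propolis_id.is_none();
    let already_ours = row.migration_id == Some(migration_id)
        && row.target_propolis_id == Some(target_propolis_id);
    if unset || already_ours {
        row.migration_id = Some(migration_id);
        row.target_propolis_id = Some(target_propolis_id);
        Ok(row.clone()) // analogous to returning the updated record
    } else {
        Err("another migration is already in progress")
    }
}

fn main() {
    let mut row = InstanceRow { migration_id: None, target_propolis_id: None };
    assert!(set_migration_ids(&mut row, 7, 42).is_ok()); // first attempt
    assert!(set_migration_ids(&mut row, 7, 42).is_ok()); // idempotent retry
    assert!(set_migration_ids(&mut row, 9, 43).is_err()); // conflicting request
    println!("{row:?}");
}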
+ (dsl::migration_id .is_null() - .or(dsl::migration_id.eq(Some(migration_id))), - ) - .filter( - dsl::target_propolis_id - .is_null() - .or(dsl::target_propolis_id.eq(Some(target_propolis_id))), + .and(dsl::target_propolis_id.is_null())) + .or(dsl::migration_id + .eq(Some(migration_id)) + .and(dsl::target_propolis_id.eq(Some(target_propolis_id)))), ) .filter(dsl::active_propolis_id.eq(src_propolis_id)) .filter(dsl::id.eq_any(vmm_ok)) From b5443329e6247eff826f99cbc8f118aad721282f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 11 Jul 2024 14:42:02 -0700 Subject: [PATCH 139/234] whoops i forgot to update expectorate queries --- ...ce_and_vmm_update_vmm_and_both_migrations.sql | 16 ++++++++-------- ...tance_and_vmm_update_vmm_and_migration_in.sql | 10 +++++----- ...ance_and_vmm_update_vmm_and_migration_out.sql | 10 +++++----- ...m_update_vmm_instance_and_both_migrations.sql | 16 ++++++++-------- ..._vmm_update_vmm_instance_and_migration_in.sql | 10 +++++----- ...vmm_update_vmm_instance_and_migration_out.sql | 10 +++++----- 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql index 15f5ec00890..354fc9a4035 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql @@ -17,9 +17,9 @@ WITH UPDATE migration SET - target_state = $2, time_target_updated = $3 + target_state = $2, time_target_updated = $3, target_gen = $4 WHERE - (migration.id = $4 AND migration.target_propolis_id = $5) AND migration.target_gen < $6 + (migration.id = $5 AND migration.target_propolis_id = $6) AND migration.target_gen < $7 RETURNING id ), @@ -40,7 +40,7 @@ WITH FROM migration WHERE - migration.id = $7 AND (migration.time_deleted IS NULL) + migration.id = $8 AND (migration.time_deleted IS NULL) ) AS id ), @@ -49,9 +49,9 @@ WITH UPDATE migration SET - source_state = $8, time_source_updated = $9 + source_state = $9, time_source_updated = $10, source_gen = $11 WHERE - (migration.id = $10 AND migration.source_propolis_id = $11) AND migration.source_gen < $12 + (migration.id = $12 AND migration.source_propolis_id = $13) AND migration.source_gen < $14 RETURNING id ), @@ -63,15 +63,15 @@ WITH migration_out_found LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $13) AS id), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $15) AS id), vmm_updated AS ( UPDATE vmm SET - time_state_updated = $14, state_generation = $15, state = $16 + time_state_updated = $16, state_generation = $17, state = $18 WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $17) AND vmm.state_generation < $18 + ((vmm.time_deleted IS NULL) AND vmm.id = $19) AND vmm.state_generation < $20 RETURNING id ), diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql index 03f6d27d2cd..870cce4c02b 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql @@ -17,9 +17,9 @@ WITH UPDATE migration SET - target_state = $2, time_target_updated = $3 + target_state = $2, time_target_updated = $3, target_gen = $4 WHERE - (migration.id 
= $4 AND migration.target_propolis_id = $5) AND migration.target_gen < $6 + (migration.id = $5 AND migration.target_propolis_id = $6) AND migration.target_gen < $7 RETURNING id ), @@ -31,15 +31,15 @@ WITH migration_in_found LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $7) AS id), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $8) AS id), vmm_updated AS ( UPDATE vmm SET - time_state_updated = $8, state_generation = $9, state = $10 + time_state_updated = $9, state_generation = $10, state = $11 WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $11) AND vmm.state_generation < $12 + ((vmm.time_deleted IS NULL) AND vmm.id = $12) AND vmm.state_generation < $13 RETURNING id ), diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql index 6dd4ab55205..4dea3779f7b 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql @@ -17,9 +17,9 @@ WITH UPDATE migration SET - source_state = $2, time_source_updated = $3 + source_state = $2, time_source_updated = $3, source_gen = $4 WHERE - (migration.id = $4 AND migration.source_propolis_id = $5) AND migration.source_gen < $6 + (migration.id = $5 AND migration.source_propolis_id = $6) AND migration.source_gen < $7 RETURNING id ), @@ -31,15 +31,15 @@ WITH migration_out_found LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $7) AS id), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $8) AS id), vmm_updated AS ( UPDATE vmm SET - time_state_updated = $8, state_generation = $9, state = $10 + time_state_updated = $9, state_generation = $10, state = $11 WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $11) AND vmm.state_generation < $12 + ((vmm.time_deleted IS NULL) AND vmm.id = $12) AND vmm.state_generation < $13 RETURNING id ), diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql index 3959f323c92..52c28f85c37 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql @@ -41,9 +41,9 @@ WITH UPDATE migration SET - target_state = $11, time_target_updated = $12 + target_state = $11, time_target_updated = $12, target_gen = $13 WHERE - (migration.id = $13 AND migration.target_propolis_id = $14) AND migration.target_gen < $15 + (migration.id = $14 AND migration.target_propolis_id = $15) AND migration.target_gen < $16 RETURNING id ), @@ -64,7 +64,7 @@ WITH FROM migration WHERE - migration.id = $16 AND (migration.time_deleted IS NULL) + migration.id = $17 AND (migration.time_deleted IS NULL) ) AS id ), @@ -73,9 +73,9 @@ WITH UPDATE migration SET - source_state = $17, time_source_updated = $18 + source_state = $18, time_source_updated = $19, source_gen = $20 WHERE - (migration.id = $19 AND migration.source_propolis_id = $20) AND migration.source_gen < $21 + (migration.id = $21 AND migration.source_propolis_id = $22) AND migration.source_gen < $23 RETURNING id ), @@ -87,15 +87,15 @@ WITH migration_out_found LEFT JOIN 
migration_out_updated ON migration_out_found.id = migration_out_updated.id ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $22) AS id), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $24) AS id), vmm_updated AS ( UPDATE vmm SET - time_state_updated = $23, state_generation = $24, state = $25 + time_state_updated = $25, state_generation = $26, state = $27 WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $26) AND vmm.state_generation < $27 + ((vmm.time_deleted IS NULL) AND vmm.id = $28) AND vmm.state_generation < $29 RETURNING id ), diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql index ab73df18048..e717008617d 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql @@ -41,9 +41,9 @@ WITH UPDATE migration SET - target_state = $11, time_target_updated = $12 + target_state = $11, time_target_updated = $12, target_gen = $13 WHERE - (migration.id = $13 AND migration.target_propolis_id = $14) AND migration.target_gen < $15 + (migration.id = $14 AND migration.target_propolis_id = $15) AND migration.target_gen < $16 RETURNING id ), @@ -55,15 +55,15 @@ WITH migration_in_found LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $16) AS id), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $17) AS id), vmm_updated AS ( UPDATE vmm SET - time_state_updated = $17, state_generation = $18, state = $19 + time_state_updated = $18, state_generation = $19, state = $20 WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $20) AND vmm.state_generation < $21 + ((vmm.time_deleted IS NULL) AND vmm.id = $21) AND vmm.state_generation < $22 RETURNING id ), diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql index eb4e558d95d..c02b73e4f60 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql +++ b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql @@ -41,9 +41,9 @@ WITH UPDATE migration SET - source_state = $11, time_source_updated = $12 + source_state = $11, time_source_updated = $12, source_gen = $13 WHERE - (migration.id = $13 AND migration.source_propolis_id = $14) AND migration.source_gen < $15 + (migration.id = $14 AND migration.source_propolis_id = $15) AND migration.source_gen < $16 RETURNING id ), @@ -55,15 +55,15 @@ WITH migration_out_found LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $16) AS id), + vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $17) AS id), vmm_updated AS ( UPDATE vmm SET - time_state_updated = $17, state_generation = $18, state = $19 + time_state_updated = $18, state_generation = $19, state = $20 WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $20) AND vmm.state_generation < $21 + ((vmm.time_deleted IS NULL) AND vmm.id = $21) AND vmm.state_generation < $22 RETURNING id ), From e6771fb88f2829bb6e8df36c9e6b630c040014d2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Jul 2024 10:08:56 -0700 Subject: [PATCH 140/234] add 
instance_wait_for_state saga test helpers --- nexus/src/app/sagas/instance_migrate.rs | 37 +++------- nexus/src/app/sagas/instance_start.rs | 36 ++-------- nexus/src/app/sagas/snapshot_create.rs | 25 ++----- nexus/src/app/sagas/test_helpers.rs | 95 +++++++++++++++++++++---- 4 files changed, 100 insertions(+), 93 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 58534bb1d37..0660abc03b5 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -591,7 +591,6 @@ async fn sim_instance_migrate( #[cfg(test)] mod tests { - use crate::app::db::datastore::InstanceAndActiveVmm; use crate::app::sagas::test_helpers; use camino::Utf8Path; use dropshot::test_util::ClientTestContext; @@ -605,8 +604,6 @@ mod tests { ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, }; use omicron_sled_agent::sim::Server; - use omicron_test_utils::dev::poll; - use std::time::Duration; use super::*; @@ -831,37 +828,19 @@ mod tests { test_helpers::instance_stop(cptestctx, &instance_id).await; test_helpers::instance_simulate(cptestctx, &instance_id) .await; - // Wait until the instance has advanced to the `NoVmm` - // state. This may not happen immediately, as the - // `Nexus::cpapi_instances_put` API endpoint simply - // writes the new VMM state to the database and *starts* - // an `instance-update` saga, and the instance record - // isn't updated until that saga completes. - let new_state = poll::wait_for_condition( - || async { - let new_state = test_helpers::instance_fetch( - cptestctx, - instance_id, - ) - .await; - if new_state.instance().runtime().nexus_state == nexus_db_model::InstanceState::Vmm { - Err(poll::CondCheckError::::NotYet) - } else { - Ok(new_state) - } - }, - &Duration::from_secs(5), - &Duration::from_secs(300), + // state. This may hot happen immediately, as an + // instance-update saga must complete to update the + // instance's state. + let new_state = test_helpers::instance_wait_for_state( + cptestctx, + instance_id, + nexus_db_model::InstanceState::NoVmm, ) - .await.expect("instance did not transition to NoVmm state after 300 seconds"); + .await; let new_instance = new_state.instance(); let new_vmm = new_state.vmm().as_ref(); - assert_eq!( - new_instance.runtime().nexus_state, - nexus_db_model::InstanceState::NoVmm, - ); assert!(new_instance.runtime().propolis_id.is_none()); assert!(new_vmm.is_none()); diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index e5cf0433810..4e777c931a7 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -685,8 +685,6 @@ mod test { use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, }; - use omicron_test_utils::dev::poll; - use std::time::Duration; use uuid::Uuid; use super::*; @@ -805,30 +803,12 @@ mod test { }, || { Box::pin(async { - let new_db_instance = - // Wait until the instance has advanced to the `NoVmm` - // state. This may not happen immediately, as the - // `Nexus::cpapi_instances_put` API endpoint simply - // writes the new VMM state to the database and *starts* - // an `instance-update` saga, and the instance record - // isn't updated until that saga completes. 
- poll::wait_for_condition( - || async { - let new_db_instance = test_helpers::instance_fetch( - cptestctx, - instance_id, - ) - .await.instance().clone(); - if new_db_instance.runtime().nexus_state == nexus_db_model::InstanceState::Vmm { - Err(poll::CondCheckError::::NotYet) - } else { - Ok(new_db_instance) - } - }, - &Duration::from_secs(5), - &Duration::from_secs(300), - ) - .await.expect("instance did not transition to NoVmm state after 300 seconds"); + let new_db_state = test_helpers::instance_wait_for_state( + cptestctx, + instance_id, + nexus_db_model::InstanceState::NoVmm, + ).await; + let new_db_instance = new_db_state.instance(); info!(log, "fetched instance runtime state after saga execution"; @@ -836,10 +816,6 @@ mod test { "instance_runtime" => ?new_db_instance.runtime()); assert!(new_db_instance.runtime().propolis_id.is_none()); - assert_eq!( - new_db_instance.runtime().nexus_state, - nexus_db_model::InstanceState::NoVmm - ); assert!(test_helpers::no_virtual_provisioning_resource_records_exist(cptestctx).await); assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); diff --git a/nexus/src/app/sagas/snapshot_create.rs b/nexus/src/app/sagas/snapshot_create.rs index fdff9c14a20..eeb14091b25 100644 --- a/nexus/src/app/sagas/snapshot_create.rs +++ b/nexus/src/app/sagas/snapshot_create.rs @@ -1749,11 +1749,9 @@ mod test { use omicron_common::api::external::InstanceCpuCount; use omicron_common::api::external::Name; use omicron_common::api::external::NameOrId; - use omicron_test_utils::dev::poll; use sled_agent_client::types::CrucibleOpts; use sled_agent_client::TestInterfaces as SledAgentTestInterfaces; use std::str::FromStr; - use std::time::Duration; type DiskTest<'a> = nexus_test_utils::resource_helpers::DiskTest<'a, crate::Server>; @@ -2317,24 +2315,13 @@ mod test { // database and *starts* an `instance-update` saga, and // the instance record isn't updated until that saga // completes. 
- poll::wait_for_condition( - || async { - let new_state = test_helpers::instance_fetch_by_name( - cptestctx, - INSTANCE_NAME, - PROJECT_NAME, - ) - .await; - if new_state.instance().runtime().nexus_state != nexus_db_model::InstanceState::NoVmm { - Err(poll::CondCheckError::<()>::NotYet) - } else { - Ok(()) - } - }, - &Duration::from_secs(5), - &Duration::from_secs(300), + test_helpers::instance_wait_for_state_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + nexus_db_model::InstanceState::NoVmm, ) - .await.expect("instance did not advance to NoVmm after 300 seconds"); + .await; test_helpers::instance_delete_by_name( cptestctx, INSTANCE_NAME, diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 0ae85ec6ad2..e749dcd5941 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -15,17 +15,20 @@ use diesel::{ BoolExpressionMethods, ExpressionMethods, QueryDsl, SelectableHelper, }; use futures::future::BoxFuture; +use nexus_db_model::InstanceState; use nexus_db_queries::{ authz, context::OpContext, db::{datastore::InstanceAndActiveVmm, lookup::LookupPath, DataStore}, }; use nexus_types::identity::Resource; +use omicron_common::api::external::Error; use omicron_common::api::external::NameOrId; +use omicron_test_utils::dev::poll; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; use sled_agent_client::TestInterfaces as _; use slog::{info, warn, Logger}; -use std::{num::NonZeroU32, sync::Arc}; +use std::{num::NonZeroU32, sync::Arc, time::Duration}; use steno::SagaDag; type ControlPlaneTestContext = @@ -188,13 +191,27 @@ pub async fn instance_fetch( db_state } -pub async fn instance_fetch_by_name( +pub(crate) async fn instance_wait_for_state( + cptestctx: &ControlPlaneTestContext, + instance_id: InstanceUuid, + desired_state: InstanceState, +) -> InstanceAndActiveVmm { + let opctx = test_opctx(&cptestctx); + let datastore = cptestctx.server.server_context().nexus.datastore(); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Read) + .await + .expect("test instance should be present in datastore"); + instance_poll_state(cptestctx, &opctx, authz_instance, desired_state).await +} + +pub async fn instance_wait_for_state_by_name( cptestctx: &ControlPlaneTestContext, name: &str, project_name: &str, + desired_state: InstanceState, ) -> InstanceAndActiveVmm { - let datastore = cptestctx.server.server_context().nexus.datastore().clone(); - let nexus = &cptestctx.server.server_context().nexus; let opctx = test_opctx(&cptestctx); let instance_selector = @@ -207,19 +224,67 @@ pub async fn instance_fetch_by_name( nexus.instance_lookup(&opctx, instance_selector).unwrap(); let (_, _, authz_instance, ..) 
= instance_lookup.fetch().await.unwrap(); - let db_state = datastore - .instance_fetch_with_vmm(&opctx, &authz_instance) - .await - .expect("test instance's info should be fetchable"); - - info!(&cptestctx.logctx.log, "fetched instance info from db"; - "instance_name" => %name, - "project_name" => %project_name, - "instance_id" => %authz_instance.id(), - "instance_and_vmm" => ?db_state); + instance_poll_state(cptestctx, &opctx, authz_instance, desired_state).await +} - db_state +async fn instance_poll_state( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + authz_instance: authz::Instance, + desired_state: InstanceState, +) -> InstanceAndActiveVmm { + const MAX_WAIT: Duration = Duration::from_secs(120); + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let log = &cptestctx.logctx.log; + let instance_id = authz_instance.id(); + + info!( + log, + "waiting for instance {instance_id} to transition to {desired_state}..."; + "instance_id" => %instance_id, + ); + let result = poll::wait_for_condition( + || async { + let db_state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .map_err(poll::CondCheckError::::Failed)?; + + if db_state.instance.runtime().nexus_state == desired_state { + info!( + log, + "instance {instance_id} transitioned to {desired_state}"; + "instance_id" => %instance_id, + "instance" => ?db_state.instance(), + "active_vmm" => ?db_state.vmm(), + ); + Ok(db_state) + } else { + info!( + log, + "instance {instance_id} has not yet transitioned to {desired_state}"; + "instance_id" => %instance_id, + "instance" => ?db_state.instance(), + "active_vmm" => ?db_state.vmm(), + ); + Err(poll::CondCheckError::::NotYet) + } + }, + &Duration::from_secs(1), + &MAX_WAIT, + ) + .await; + + match result { + Ok(i) => i, + Err(e) => panic!( + "instance {instance_id} did not transition to {desired_state} \ + after {MAX_WAIT:?}: {e}" + ), + } } + pub async fn no_virtual_provisioning_resource_records_exist( cptestctx: &ControlPlaneTestContext, ) -> bool { From a4c3eacac0c6f43f2cd302a275bd0a9263ed57fb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Jul 2024 11:05:53 -0700 Subject: [PATCH 141/234] remove test that doesn't really matter --- sled-agent/src/common/instance.rs | 91 ------------------------------- 1 file changed, 91 deletions(-) diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 82087109905..dc8d1b09a6e 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -629,95 +629,4 @@ mod test { assert_eq!(migration.state, MigrationState::Failed); assert!(migration.gen > prev_migration.gen); } - - #[test] - #[ignore = "this logic is basically trivial now, maybe just get rid of the test?"] - fn migration_out_after_migration_in() { - todo!("eliza") - // let mut state = make_migration_target_instance(); - // let mut observed = ObservedPropolisState { - // vmm_state: PropolisInstanceState(Observed::Running), - // migration_in: ObservedMigrationStatus::Succeeded, - // time: Utc::now(), - // }; - - // // The transition into the Running state on the migration target should - // // take over for the source, updating the Propolis generation. - // let prev = state.clone(); - // assert!(state.apply_propolis_observation(&observed).is_none()); - // assert_state_change_has_gen_change(&prev, &state); - // assert_eq!(state.vmm.state, VmmState::Running); - // assert!(state.vmm.gen > prev.vmm.gen); - - // // The migration state should transition to completed. 
- // let migration = state - // .migration - // .clone() - // .expect("instance must have a migration state"); - // let prev_migration = - // prev.migration.expect("previous state must have a migration"); - // assert_eq!(migration.state, MigrationState::Completed); - // assert!(migration.gen > prev_migration.gen); - - // // Pretend Nexus set some new migration IDs. - // let migration_id = Uuid::new_v4(); - // let prev = state.clone(); - // state.set_migration_ids( - // &Some(InstanceMigrationSourceParams { - // migration_id, - // dst_propolis_id: PropolisUuid::new_v4(), - // }), - // Utc::now(), - // ); - // assert_state_change_has_gen_change(&prev, &state); - // assert_eq!(state.vmm.gen, prev.vmm.gen); - - // // There should be a new, pending migration state. - // let migration = state - // .migration - // .clone() - // .expect("instance must have a migration state"); - // assert_eq!(migration.state, MigrationState::Pending); - // assert_eq!(migration.migration_id, migration_id); - // let prev_migration = migration; - - // // Mark that the new migration out is in progress. This doesn't change - // // anything in the instance runtime state, but does update the VMM state - // // generation. - // let prev = state.clone(); - // observed.vmm_state = PropolisInstanceState(Observed::Migrating); - // assert!(state.apply_propolis_observation(&observed).is_none()); - // assert_state_change_has_gen_change(&prev, &state); - // assert_eq!(state.vmm.state, VmmState::Migrating); - // assert!(state.vmm.gen > prev.vmm.gen); - - // // The migration state should transition to in progress. - // let migration = state - // .migration - // .clone() - // .expect("instance must have a migration state"); - // assert_eq!(migration.state, MigrationState::InProgress); - // assert!(migration.gen > prev_migration.gen); - // let prev_migration = migration; - - // // Propolis will publish that the migration succeeds before changing any - // // state. This should transfer control to the target but should not - // // touch the migration ID (that is the new target's job). - // let prev = state.clone(); - // observed.vmm_state = PropolisInstanceState(Observed::Migrating); - // assert!(state.apply_propolis_observation(&observed).is_none()); - // assert_state_change_has_gen_change(&prev, &state); - // assert_eq!(state.vmm.state, VmmState::Migrating); - // assert!(state.vmm.gen > prev.vmm.gen); - - // // The migration state should transition to completed. - // let migration = state - // .migration - // .clone() - // .expect("instance must have a migration state"); - // assert_eq!(migration.state, MigrationState::Completed); - // assert!(migration.gen > prev_migration.gen); - - // // The rest of the destruction sequence is covered by other tests. 
- } } From 9d622f0329a7d0eee99e3250d394760958702671 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Jul 2024 11:19:49 -0700 Subject: [PATCH 142/234] remove defunct `instance_put_migration_ids` API --- openapi/sled-agent.json | 86 -------------------------- sled-agent/src/http_entrypoints.rs | 22 +------ sled-agent/src/params.rs | 17 ----- sled-agent/src/sim/http_entrypoints.rs | 19 +----- 4 files changed, 4 insertions(+), 140 deletions(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index 478b6c52b0d..bf51462c9d9 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -419,49 +419,6 @@ } } }, - "/instances/{instance_id}/migration-ids": { - "put": { - "operationId": "instance_put_migration_ids", - "parameters": [ - { - "in": "path", - "name": "instance_id", - "required": true, - "schema": { - "$ref": "#/components/schemas/TypedUuidForInstanceKind" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InstancePutMigrationIdsBody" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "successful operation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/SledInstanceState" - } - } - } - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/instances/{instance_id}/state": { "get": { "operationId": "instance_get_state", @@ -3063,23 +3020,6 @@ "silo_id" ] }, - "InstanceMigrationSourceParams": { - "description": "Instance runtime state to update for a migration.", - "type": "object", - "properties": { - "dst_propolis_id": { - "$ref": "#/components/schemas/TypedUuidForPropolisKind" - }, - "migration_id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "dst_propolis_id", - "migration_id" - ] - }, "InstanceMigrationTargetParams": { "description": "Parameters used when directing Propolis to initialize itself via live migration.", "type": "object", @@ -3124,32 +3064,6 @@ "ncpus" ] }, - "InstancePutMigrationIdsBody": { - "description": "The body of a request to set or clear the migration identifiers from a sled agent's instance state records.", - "type": "object", - "properties": { - "migration_params": { - "nullable": true, - "description": "The migration identifiers to set. If `None`, this operation clears the migration IDs.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceMigrationSourceParams" - } - ] - }, - "old_runtime": { - "description": "The last instance runtime state known to this requestor. This request will succeed if either (a) the state generation in the sled agent's runtime state matches the generation in this record, or (b) the sled agent's runtime state matches what would result from applying this request to the caller's runtime state. 
This latter condition provides idempotency.", - "allOf": [ - { - "$ref": "#/components/schemas/InstanceRuntimeState" - } - ] - } - }, - "required": [ - "old_runtime" - ] - }, "InstancePutStateBody": { "description": "The body of a request to move a previously-ensured instance into a specific runtime state.", "type": "object", diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index 7ce7be1d079..820ec746b88 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -8,9 +8,9 @@ use super::sled_agent::SledAgent; use crate::bootstrap::params::AddSledRequest; use crate::params::{ BootstoreStatus, CleanupContextUpdate, DiskEnsureBody, InstanceEnsureBody, - InstanceExternalIpBody, InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, TimeSync, - VpcFirewallRulesEnsureBody, ZoneBundleId, ZoneBundleMetadata, Zpool, + InstanceExternalIpBody, InstancePutStateBody, InstancePutStateResponse, + InstanceUnregisterResponse, TimeSync, VpcFirewallRulesEnsureBody, + ZoneBundleId, ZoneBundleMetadata, Zpool, }; use crate::sled_agent::Error as SledAgentError; use crate::zone_bundle; @@ -54,7 +54,6 @@ pub fn api() -> SledApiDescription { api.register(disk_put)?; api.register(cockroachdb_init)?; api.register(instance_issue_disk_snapshot_request)?; - api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; api.register(instance_get_state)?; api.register(instance_put_external_ip)?; @@ -496,21 +495,6 @@ async fn instance_get_state( Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}/migration-ids", -}] -async fn instance_put_migration_ids( - _: RequestContext, - _: Path, - _: TypedBody, -) -> Result, HttpError> { - Err(HttpError::for_bad_request( - None, - "operation no longer supported".to_string(), - )) -} - #[endpoint { method = PUT, path = "/instances/{instance_id}/external-ip", diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index b7a143cf878..4a7885279c3 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -210,23 +210,6 @@ pub struct InstanceMigrationSourceParams { pub dst_propolis_id: PropolisUuid, } -/// The body of a request to set or clear the migration identifiers from a -/// sled agent's instance state records. -#[derive(Debug, Serialize, Deserialize, JsonSchema)] -pub struct InstancePutMigrationIdsBody { - /// The last instance runtime state known to this requestor. This request - /// will succeed if either (a) the state generation in the sled agent's - /// runtime state matches the generation in this record, or (b) the sled - /// agent's runtime state matches what would result from applying this - /// request to the caller's runtime state. This latter condition provides - /// idempotency. - pub old_runtime: InstanceRuntimeState, - - /// The migration identifiers to set. If `None`, this operation clears the - /// migration IDs. 
- pub migration_params: Option, -} - #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] pub enum DiskType { U2, diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 51e5ad977fc..d042e19814e 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -8,8 +8,7 @@ use super::collection::PokeMode; use crate::bootstrap::params::AddSledRequest; use crate::params::{ DiskEnsureBody, InstanceEnsureBody, InstanceExternalIpBody, - InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, + InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, VpcFirewallRulesEnsureBody, }; use dropshot::ApiDescription; @@ -46,7 +45,6 @@ pub fn api() -> SledApiDescription { fn register_endpoints( api: &mut SledApiDescription, ) -> Result<(), ApiDescriptionRegisterError> { - api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; api.register(instance_get_state)?; api.register(instance_register)?; @@ -160,21 +158,6 @@ async fn instance_get_state( Ok(HttpResponseOk(sa.instance_get_state(instance_id).await?)) } -#[endpoint { - method = PUT, - path = "/instances/{instance_id}/migration-ids", -}] -async fn instance_put_migration_ids( - _: RequestContext>, - _: Path, - _: TypedBody, -) -> Result, HttpError> { - Err(HttpError::for_bad_request( - None, - "operation no longer supported".to_string(), - )) -} - #[endpoint { method = PUT, path = "/instances/{instance_id}/external-ip", From 91beec6efdfe8d5ae72562190ae8244d87a6a6f8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Jul 2024 11:35:55 -0700 Subject: [PATCH 143/234] sled-agent's per-instance logger should have UUIDs Especially now that the `Instance` struct doesn't actually know its UUID. --- sled-agent/src/instance_manager.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 0de890ce63e..5ac9ddbed77 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -603,7 +603,8 @@ impl InstanceManagerRunner { info!(&self.log, "registering new instance"; "instance_id" => ?instance_id); - let instance_log = self.log.new(o!()); + let instance_log = + self.log.new(o!("instance_id" => format!("{instance_id}"))); let ticket = InstanceTicket::new(instance_id, self.terminate_tx.clone()); From 96a21d036e44668d5d47481f973492068e358ed4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 12 Jul 2024 15:03:49 -0700 Subject: [PATCH 144/234] misc commentary suggestions from @gjcolombo --- nexus/src/app/sagas/instance_migrate.rs | 2 +- nexus/tests/integration_tests/instances.rs | 6 ------ sled-agent/src/instance.rs | 19 +++++++++++++++++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 0660abc03b5..4432e806efa 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -829,7 +829,7 @@ mod tests { test_helpers::instance_simulate(cptestctx, &instance_id) .await; // Wait until the instance has advanced to the `NoVmm` - // state. This may hot happen immediately, as an + // state. This may not happen immediately, as an // instance-update saga must complete to update the // instance's state. 
let new_state = test_helpers::instance_wait_for_state( diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index d80938daabd..2e41fac3a4e 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -5130,12 +5130,6 @@ async fn instance_simulate_on_sled( /// Simulates a migration source for the provided instance ID, sled ID, and /// migration ID. -// -// XXX(eliza): I had really wanted to have the migration target's simulated -// sled-agent do this automagically when it's told to start a migration in, but -// unfortunately, I wasn't able to figure out a way for it to get the simulated -// *sled-agent*'s IP --- it just gets the Propolis IP in the migration target -// params, and the propolis doesn't actually exist... async fn instance_simulate_migration_source( cptestctx: &ControlPlaneTestContext, nexus: &Arc, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 60164fd7485..8060dcea3a2 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -695,8 +695,23 @@ impl InstanceRunner { Some(params) => { let migration_id = self.state .migration_in() - // TODO(eliza): this is a bit of a shame; it would be nice - // to refactor this code so we don't unwrap here. + // TODO(eliza): This is a bit of an unfortunate dance: the + // initial instance-ensure-registered request is what sends + // the migration ID, but it's the subsequent + // instance-ensure-state request (which we're handling here) + // that includes migration the source VMM's UUID and IP + // address. Because the API currently splits the migration + // IDs between the instance-ensure-registered and + // instance-ensure-state requests, we have to stash the + // migration ID in an `Option` and `expect()` it here, + // panicking if we get an instance-ensure-state request with + // a source Propolis ID if the instance wasn't registered + // with a migration in ID. + // + // This is kind of a shame. Eventually, we should consider + // reworking the API ensure-state request contains the + // migration ID, and we don't have to unwrap here. 
See: + // https://github.com/oxidecomputer/omicron/issues/6073 .expect("if we have migration target params, we should also have a migration in") .migration_id; Some(propolis_client::types::InstanceMigrateInitiateRequest { From 70a7fbf56229d4e5885a5875b5de7e7160a1b1e1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 15 Jul 2024 16:28:14 -0700 Subject: [PATCH 145/234] wip: instance updater saga tests --- nexus/src/app/sagas/instance_update/mod.rs | 128 +++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 5f6eebb8cce..eaa07500c98 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -737,3 +737,131 @@ async fn unlock_instance_inner( Ok(()) } + +#[cfg(test)] +mod test { + use super::*; + use crate::app::sagas::test_helpers; + use crate::external_api::params; + use chrono::Utc; + use dropshot::test_util::ClientTestContext; + use nexus_db_model::VmmRuntimeState; + use nexus_db_queries::db::lookup::LookupPath; + use nexus_test_utils::resource_helpers::{ + create_default_ip_pool, create_project, object_create, + }; + use nexus_test_utils_macros::nexus_test; + use omicron_uuid_kinds::GenericUuid; + use omicron_uuid_kinds::PropolisUuid; + use uuid::Uuid; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + const PROJECT_NAME: &str = "test-project"; + const INSTANCE_NAME: &str = "test-instance"; + + async fn setup_test_project(client: &ClientTestContext) -> Uuid { + create_default_ip_pool(&client).await; + let project = create_project(&client, PROJECT_NAME).await; + project.identity.id + } + + async fn create_instance( + client: &ClientTestContext, + ) -> omicron_common::api::external::Instance { + use omicron_common::api::external::{ + ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, + }; + let instances_url = format!("/v1/instances?project={}", PROJECT_NAME); + object_create( + client, + &instances_url, + ¶ms::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: INSTANCE_NAME.parse().unwrap(), + description: format!("instance {:?}", INSTANCE_NAME), + }, + ncpus: InstanceCpuCount(2), + memory: ByteCount::from_gibibytes_u32(2), + hostname: INSTANCE_NAME.parse().unwrap(), + user_data: b"#cloud-config".to_vec(), + ssh_public_keys: Some(Vec::new()), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + external_ips: vec![], + disks: vec![], + start: true, + }, + ) + .await + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = + cptestctx.server.server_context().nexus.datastore().clone(); + let _project_id = setup_test_project(&client).await; + + let opctx = test_helpers::test_opctx(cptestctx); + let instance = create_instance(client).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + // Poke the instance to get it into the Running state. 
+ test_helpers::instance_simulate(cptestctx, &instance_id).await; + + // Now, destroy the active VMM + let state = test_helpers::instance_fetch(cptestctx, instance_id).await; + let vmm = state.vmm().as_ref().unwrap(); + let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); + datastore + .vmm_update_runtime( + &vmm_id, + &VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation(vmm.runtime.gen.0.next()), + state: VmmState::Destroyed, + }, + ) + .await + .unwrap(); + + let (_, _, authz_instance, ..) = LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .fetch() + .await + .expect("test instance should be present in datastore"); + + // run the instance-update saga + nexus + .sagas + .saga_execute::(Params { + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + authz_instance, + }) + .await + .expect("update saga should succeed"); + + // TODO(eliza): it would be nicer if we ensured that the start saga + // kicked off by the transition to `Running` completed *before* going to + // Destroyed, so we can guarantee that the saga that _we_ run performs + // the update. Figure that out. + test_helpers::instance_wait_for_state( + cptestctx, + instance_id, + InstanceState::NoVmm, + ) + .await; + assert!( + test_helpers::no_virtual_provisioning_resource_records_exist( + cptestctx + ) + .await + ); + assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); + } +} From 6951d97cf79ce20d36e32df62bb8758cd8220dcc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 16 Jul 2024 09:37:30 -0700 Subject: [PATCH 146/234] wait for the preceeding update saga to complete --- nexus/src/app/instance.rs | 8 ++ nexus/src/app/sagas/instance_update/mod.rs | 91 ++++++++++++++++++---- 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 95a2f5122fc..9750b2dd4e6 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1918,6 +1918,14 @@ pub(crate) async fn notify_instance_updated( // If the instance or VMM records in the database have changed as a result // of this update, prepare an `instance-update` saga to ensure that the // changes are reflected by the instance record. + // + // TODO(eliza): it would be nice to be smarter about determining whether we + // need to run a saga here. We don't need to run an instance-update saga + // *every* time a VMM or migration has been updated. instead, we should only + // trigger them if any side of the migration has *terminated*, or if the + // active VMM state transitioned to Destroyed. Eliding unnecessary start + // sagas would reduce updater lock contention and allow the necessary sagas + // to run in a timelier manner. let updated = vmm_updated || migration_updated.unwrap_or(false); if updated { let (.., authz_instance) = LookupPath::new(&opctx, datastore) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index eaa07500c98..530ff1a6f0b 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2,6 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. +//! Instance Update Saga +//! +//! 
# Theory of Operation use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, @@ -741,18 +744,22 @@ async fn unlock_instance_inner( #[cfg(test)] mod test { use super::*; + use crate::app::db::model::Instance; + use crate::app::db::model::VmmRuntimeState; use crate::app::sagas::test_helpers; use crate::external_api::params; use chrono::Utc; use dropshot::test_util::ClientTestContext; - use nexus_db_model::VmmRuntimeState; + use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::lookup::LookupPath; use nexus_test_utils::resource_helpers::{ create_default_ip_pool, create_project, object_create, }; use nexus_test_utils_macros::nexus_test; + use omicron_test_utils::dev::poll; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; + use std::time::Duration; use uuid::Uuid; type ControlPlaneTestContext = @@ -797,6 +804,63 @@ mod test { .await } + /// Wait for an update saga to complete for the provided `instance`. + async fn wait_for_update( + cptestctx: &ControlPlaneTestContext, + instance: &Instance, + ) -> InstanceAndActiveVmm { + // I'd be pretty surprised if an update saga takes longer than a minute + // to complete in a unit test. If a saga hasn't run, failing the test in + // a timely manner is helpful, so making this *too* long could be annoying... + const MAX_WAIT: Duration = Duration::from_secs(60); + + let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); + let initial_gen = instance.updater_gen; + let log = &cptestctx.logctx.log; + + info!( + log, + "waiting for instance update to complete..."; + "instance_id" => %instance_id, + "initial_gen" => ?initial_gen, + ); + + poll::wait_for_condition( + || async { + let state = + test_helpers::instance_fetch(cptestctx, instance_id).await; + let instance = state.instance(); + if instance.updater_gen > initial_gen + && instance.updater_id.is_none() + { + info!( + log, + "instance update completed!"; + "instance_id" => %instance_id, + "initial_gen" => ?initial_gen, + "current_gen" => ?instance.updater_gen, + ); + return Ok(state); + } + + info!( + log, + "instance update has not yet completed..."; + "instance_id" => %instance_id, + "initial_gen" => ?initial_gen, + "current_gen" => ?instance.updater_gen, + "current_updater" => ?instance.updater_id, + ); + Err(poll::CondCheckError::NotYet::<()>) + }, + // A lot can happen in one second... + &Duration::from_secs(1), + &MAX_WAIT, + ) + .await + .unwrap() + } + #[nexus_test(server = crate::Server)] async fn test_active_vmm_destroyed_succeeds( cptestctx: &ControlPlaneTestContext, @@ -812,10 +876,16 @@ mod test { let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); // Poke the instance to get it into the Running state. + let state = test_helpers::instance_fetch(cptestctx, instance_id).await; test_helpers::instance_simulate(cptestctx, &instance_id).await; + // Wait for the instance update saga triggered by a transition to + // Running to complete. + // TODO(eliza): it would be a bit nicer if `notify_instance_updated` was + // smarter about determining whether it should try to start an update + // saga, since this update doesn't actually do anything... 
+ let state = wait_for_update(cptestctx, state.instance()).await; // Now, destroy the active VMM - let state = test_helpers::instance_fetch(cptestctx, instance_id).await; let vmm = state.vmm().as_ref().unwrap(); let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); datastore @@ -836,7 +906,7 @@ mod test { .await .expect("test instance should be present in datastore"); - // run the instance-update saga + // Run the instance-update saga and wait for the update to complete. nexus .sagas .saga_execute::(Params { @@ -845,17 +915,12 @@ mod test { }) .await .expect("update saga should succeed"); + let state = wait_for_update(cptestctx, state.instance()).await; - // TODO(eliza): it would be nicer if we ensured that the start saga - // kicked off by the transition to `Running` completed *before* going to - // Destroyed, so we can guarantee that the saga that _we_ run performs - // the update. Figure that out. - test_helpers::instance_wait_for_state( - cptestctx, - instance_id, - InstanceState::NoVmm, - ) - .await; + assert_eq!( + state.instance().runtime().nexus_state, + InstanceState::NoVmm + ); assert!( test_helpers::no_virtual_provisioning_resource_records_exist( cptestctx From 579c135c158c64bc6783aabfe023e39f991bbca8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 16 Jul 2024 11:07:16 -0700 Subject: [PATCH 147/234] more assertions --- nexus/src/app/sagas/instance_update/mod.rs | 47 ++++++++++++++++++---- nexus/src/app/sagas/test_helpers.rs | 31 ++++++++++++++ 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 530ff1a6f0b..518effee301 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -880,10 +880,36 @@ mod test { test_helpers::instance_simulate(cptestctx, &instance_id).await; // Wait for the instance update saga triggered by a transition to // Running to complete. - // TODO(eliza): it would be a bit nicer if `notify_instance_updated` was - // smarter about determining whether it should try to start an update - // saga, since this update doesn't actually do anything... let state = wait_for_update(cptestctx, state.instance()).await; + // The instance should have an active VMM. + let instance_runtime = state.instance().runtime(); + assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); + assert!(instance_runtime.propolis_id.is_some()); + // Once we destroy the active VMM, we'll assert that the virtual + // provisioning and sled resource records it owns have been deallocated. + // In order to ensure we're actually testing the correct thing, let's + // make sure that those records exist now --- if not, the assertions + // later won't mean anything! + assert!( + !test_helpers::no_virtual_provisioning_resource_records_exist( + cptestctx + ) + .await, + "we can't assert that a destroyed VMM instance update deallocates \ + virtual provisioning records if none exist!", + ); + assert!( + !test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx) + .await, + "we can't assert that a destroyed VMM instance update deallocates \ + virtual provisioning records if none exist!", + ); + assert!( + !test_helpers::no_sled_resource_instance_records_exist(cptestctx) + .await, + "we can't assert that a destroyed VMM instance update deallocates \ + sled resource records if none exist!" 
+ ); // Now, destroy the active VMM let vmm = state.vmm().as_ref().unwrap(); @@ -917,10 +943,13 @@ mod test { .expect("update saga should succeed"); let state = wait_for_update(cptestctx, state.instance()).await; - assert_eq!( - state.instance().runtime().nexus_state, - InstanceState::NoVmm - ); + // The instance's active VMM has been destroyed, so its state should + // transition to `NoVmm`, and its active VMM ID should be unlinked. The + // virtual provisioning and sled resources allocated to the instance + // should be deallocated. + let instance_runtime = state.instance().runtime(); + assert_eq!(instance_runtime.nexus_state, InstanceState::NoVmm); + assert!(instance_runtime.propolis_id.is_none()); assert!( test_helpers::no_virtual_provisioning_resource_records_exist( cptestctx @@ -928,5 +957,9 @@ mod test { .await ); assert!(test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx).await); + assert!( + test_helpers::no_sled_resource_instance_records_exist(cptestctx) + .await + ); } } diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index e749dcd5941..6231f1fe56b 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -347,6 +347,37 @@ pub async fn no_virtual_provisioning_collection_records_using_instances( .unwrap() } +pub async fn no_sled_resource_instance_records_exist( + cptestctx: &ControlPlaneTestContext, +) -> bool { + use nexus_db_queries::db::model::SledResource; + use nexus_db_queries::db::model::SledResourceKind; + use nexus_db_queries::db::schema::sled_resource::dsl; + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper("no_sled_resource_instance_records_exist") + .transaction(&conn, |conn| async move { + conn.batch_execute_async( + nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, + ) + .await + .unwrap(); + + Ok(dsl::sled_resource + .filter(dsl::kind.eq(SledResourceKind::Instance)) + .select(SledResource::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) + }) + .await + .unwrap() +} + /// Tests that the saga described by `dag` succeeds if each of its nodes is /// repeated. 
/// From 5f7dbe1494486f8d654fc5f942b0f42e95cc112f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 16 Jul 2024 11:10:43 -0700 Subject: [PATCH 148/234] ah, there was already a thingy for that --- nexus/src/app/sagas/instance_create.rs | 40 ++++---------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 4f0ec7c0c61..d19230892fa 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -1065,7 +1065,7 @@ pub mod test { app::sagas::instance_create::SagaInstanceCreate, app::sagas::test_helpers, external_api::params, }; - use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; + use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -1201,39 +1201,6 @@ pub mod test { .is_none() } - async fn no_sled_resource_instance_records_exist( - datastore: &DataStore, - ) -> bool { - use nexus_db_queries::db::model::SledResource; - use nexus_db_queries::db::schema::sled_resource::dsl; - - let conn = datastore.pool_connection_for_tests().await.unwrap(); - - datastore - .transaction_retry_wrapper( - "no_sled_resource_instance_records_exist", - ) - .transaction(&conn, |conn| async move { - conn.batch_execute_async( - nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, - ) - .await - .unwrap(); - - Ok(dsl::sled_resource - .filter(dsl::kind.eq( - nexus_db_queries::db::model::SledResourceKind::Instance, - )) - .select(SledResource::as_select()) - .get_results_async::(&conn) - .await - .unwrap() - .is_empty()) - }) - .await - .unwrap() - } - async fn disk_is_detached(datastore: &DataStore) -> bool { use nexus_db_queries::db::model::Disk; use nexus_db_queries::db::schema::disk::dsl; @@ -1267,7 +1234,10 @@ pub mod test { assert!(no_instance_records_exist(datastore).await); assert!(no_network_interface_records_exist(datastore).await); assert!(no_external_ip_records_exist(datastore).await); - assert!(no_sled_resource_instance_records_exist(datastore).await); + assert!( + test_helpers::no_sled_resource_instance_records_exist(cptestctx) + .await + ); assert!( test_helpers::no_virtual_provisioning_resource_records_exist( cptestctx From 2515d2098ad4120fe38eeb070de2a412e0b564a7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 16 Jul 2024 14:08:41 -0700 Subject: [PATCH 149/234] additional active_vmm_destroyed tests --- nexus/src/app/sagas/instance_update/mod.rs | 143 ++++++++++++++++++--- 1 file changed, 128 insertions(+), 15 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 518effee301..b8b6f74dab6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -746,6 +746,7 @@ mod test { use super::*; use crate::app::db::model::Instance; use crate::app::db::model::VmmRuntimeState; + use crate::app::saga::create_saga_dag; use crate::app::sagas::test_helpers; use crate::external_api::params; use chrono::Utc; @@ -768,6 +769,26 @@ mod test { const PROJECT_NAME: &str = "test-project"; const INSTANCE_NAME: &str = "test-instance"; + // Most Nexus sagas have test suites that follow a simple formula: there's + // usually a `test_saga_basic_usage_succeeds` that just makes sure the saga + // basically works, and then a `test_actions_succeed_idempotently` test that + // does the same thing, but runs every action twice. 
Then, there's usually a + // `test_action_failures_can_unwind` test, and often also a + // `test_action_failures_can_unwind_idempotently` test. + // + // For the instance-update saga, the test suite is a bit more complicated. + // This saga will do a number of different things depending on the ways in + // which the instance's migration and VMM records have changed since the + // last update. Therefore, we want to test all of the possible branches + // through the saga: + // + // 1. active VMM destroyed + // 2. migration source completed + // 3. migration target completed + // 4. migration source VMM completed and was destroyed, + // 5. migration target failed + // 6. migration source failed + async fn setup_test_project(client: &ClientTestContext) -> Uuid { create_default_ip_pool(&client).await; let project = create_project(&client, PROJECT_NAME).await; @@ -861,14 +882,12 @@ mod test { .unwrap() } - #[nexus_test(server = crate::Server)] - async fn test_active_vmm_destroyed_succeeds( + async fn setup_active_vmm_destroyed_test( cptestctx: &ControlPlaneTestContext, - ) { + ) -> (InstanceAndActiveVmm, Params) { let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; - let datastore = - cptestctx.server.server_context().nexus.datastore().clone(); + let datastore = nexus.datastore().clone(); let _project_id = setup_test_project(&client).await; let opctx = test_helpers::test_opctx(cptestctx); @@ -931,22 +950,29 @@ mod test { .fetch() .await .expect("test instance should be present in datastore"); + let params = Params { + authz_instance, + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + }; + (state, params) + } - // Run the instance-update saga and wait for the update to complete. - nexus - .sagas - .saga_execute::(Params { - serialized_authn: authn::saga::Serialized::for_opctx(&opctx), - authz_instance, - }) - .await - .expect("update saga should succeed"); - let state = wait_for_update(cptestctx, state.instance()).await; + async fn verify_active_vmm_destroyed( + cptestctx: &ControlPlaneTestContext, + instance_id: Uuid, + ) { + let state = test_helpers::instance_fetch( + cptestctx, + InstanceUuid::from_untyped_uuid(instance_id), + ) + .await; // The instance's active VMM has been destroyed, so its state should // transition to `NoVmm`, and its active VMM ID should be unlinked. The // virtual provisioning and sled resources allocated to the instance // should be deallocated. + assert_instance_unlocked(state.instance()); + assert!(state.vmm().is_none()); let instance_runtime = state.instance().runtime(); assert_eq!(instance_runtime.nexus_state, InstanceState::NoVmm); assert!(instance_runtime.propolis_id.is_none()); @@ -962,4 +988,91 @@ mod test { .await ); } + + #[track_caller] + fn assert_instance_unlocked(instance: &Instance) { + assert_eq!( + instance.updater_id, None, + "instance updater lock should have been released" + ) + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + + // Run the instance-update saga. + let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(params) + .await + .expect("update saga should succeed"); + + // Assert that the saga properly cleaned up the active VMM's resources. 
+ verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + + // Build the saga DAG with the provided test parameters + let dag = create_saga_dag::(params).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Assert that the saga properly cleaned up the active VMM's resources. + verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let (state, Params { serialized_authn, authz_instance }) = + setup_active_vmm_destroyed_test(cptestctx).await; + let instance_id = + InstanceUuid::from_untyped_uuid(state.instance().id()); + + test_helpers::action_failure_can_unwind::( + &cptestctx.server.server_context().nexus, + || { + Box::pin(async { + Params { + serialized_authn: serialized_authn.clone(), + authz_instance: authz_instance.clone(), + } + }) + }, + || { + Box::pin({ + async { + let state = test_helpers::instance_fetch( + cptestctx, + instance_id, + ) + .await; + // Unlike most other sagas, we actually don't unwind the + // work performed by an update saga, as we would prefer + // that at least some of it succeeds. The only thing + // that *needs* to be rolled back when an + // instance-update saga fails is that the updater lock + // *MUST* be released so that a subsequent saga can run. + assert_instance_unlocked(state.instance()); + } + }) + }, + &cptestctx.logctx.log, + ) + .await; + } } From 9f868654df2986b3162c1fcff8813c059bb68656 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 16 Jul 2024 14:46:04 -0700 Subject: [PATCH 150/234] rearrange deck chairs --- nexus/src/app/sagas/instance_update/mod.rs | 178 +++++++++++---------- 1 file changed, 91 insertions(+), 87 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index b8b6f74dab6..75524c4d14c 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -882,6 +882,97 @@ mod test { .unwrap() } + #[track_caller] + fn assert_instance_unlocked(instance: &Instance) { + assert_eq!( + instance.updater_id, None, + "instance updater lock should have been released" + ) + } + + // === Active VMM destroyed tests ==== + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + + // Run the instance-update saga. + let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(params) + .await + .expect("update saga should succeed"); + + // Assert that the saga properly cleaned up the active VMM's resources. 
+ verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + + // Build the saga DAG with the provided test parameters + let dag = create_saga_dag::(params).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Assert that the saga properly cleaned up the active VMM's resources. + verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_active_vmm_destroyed_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let (state, Params { serialized_authn, authz_instance }) = + setup_active_vmm_destroyed_test(cptestctx).await; + let instance_id = + InstanceUuid::from_untyped_uuid(state.instance().id()); + + test_helpers::action_failure_can_unwind::( + &cptestctx.server.server_context().nexus, + || { + Box::pin(async { + Params { + serialized_authn: serialized_authn.clone(), + authz_instance: authz_instance.clone(), + } + }) + }, + || { + Box::pin({ + async { + let state = test_helpers::instance_fetch( + cptestctx, + instance_id, + ) + .await; + // Unlike most other sagas, we actually don't unwind the + // work performed by an update saga, as we would prefer + // that at least some of it succeeds. The only thing + // that *needs* to be rolled back when an + // instance-update saga fails is that the updater lock + // *MUST* be released so that a subsequent saga can run. + assert_instance_unlocked(state.instance()); + } + }) + }, + &cptestctx.logctx.log, + ) + .await; + } + + // --- test helpers --- + async fn setup_active_vmm_destroyed_test( cptestctx: &ControlPlaneTestContext, ) -> (InstanceAndActiveVmm, Params) { @@ -988,91 +1079,4 @@ mod test { .await ); } - - #[track_caller] - fn assert_instance_unlocked(instance: &Instance) { - assert_eq!( - instance.updater_id, None, - "instance updater lock should have been released" - ) - } - - #[nexus_test(server = crate::Server)] - async fn test_active_vmm_destroyed_succeeds( - cptestctx: &ControlPlaneTestContext, - ) { - let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; - - // Run the instance-update saga. - let nexus = &cptestctx.server.server_context().nexus; - nexus - .sagas - .saga_execute::(params) - .await - .expect("update saga should succeed"); - - // Assert that the saga properly cleaned up the active VMM's resources. - verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; - } - - #[nexus_test(server = crate::Server)] - async fn test_active_vmm_destroyed_actions_succeed_idempotently( - cptestctx: &ControlPlaneTestContext, - ) { - let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; - - // Build the saga DAG with the provided test parameters - let dag = create_saga_dag::(params).unwrap(); - - crate::app::sagas::test_helpers::actions_succeed_idempotently( - &cptestctx.server.server_context().nexus, - dag, - ) - .await; - - // Assert that the saga properly cleaned up the active VMM's resources. 
- verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; - } - - #[nexus_test(server = crate::Server)] - async fn test_active_vmm_destroyed_action_failure_can_unwind( - cptestctx: &ControlPlaneTestContext, - ) { - let (state, Params { serialized_authn, authz_instance }) = - setup_active_vmm_destroyed_test(cptestctx).await; - let instance_id = - InstanceUuid::from_untyped_uuid(state.instance().id()); - - test_helpers::action_failure_can_unwind::( - &cptestctx.server.server_context().nexus, - || { - Box::pin(async { - Params { - serialized_authn: serialized_authn.clone(), - authz_instance: authz_instance.clone(), - } - }) - }, - || { - Box::pin({ - async { - let state = test_helpers::instance_fetch( - cptestctx, - instance_id, - ) - .await; - // Unlike most other sagas, we actually don't unwind the - // work performed by an update saga, as we would prefer - // that at least some of it succeeds. The only thing - // that *needs* to be rolled back when an - // instance-update saga fails is that the updater lock - // *MUST* be released so that a subsequent saga can run. - assert_instance_unlocked(state.instance()); - } - }) - }, - &cptestctx.logctx.log, - ) - .await; - } } From d67395e65746256ec6d4e9d1cbaef19c19f3a1eb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 17 Jul 2024 13:06:47 -0700 Subject: [PATCH 151/234] migration source completed tests --- nexus/src/app/sagas/instance_migrate.rs | 70 +----- nexus/src/app/sagas/instance_update/mod.rs | 244 +++++++++++++++++++++ nexus/src/app/sagas/test_helpers.rs | 53 ++++- 3 files changed, 305 insertions(+), 62 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 4432e806efa..2c61b4380c8 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -591,21 +591,16 @@ async fn sim_instance_migrate( #[cfg(test)] mod tests { + use super::*; use crate::app::sagas::test_helpers; - use camino::Utf8Path; use dropshot::test_util::ClientTestContext; - use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::{ create_default_ip_pool, create_project, object_create, }; - use nexus_test_utils::start_sled_agent; use nexus_test_utils_macros::nexus_test; use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, }; - use omicron_sled_agent::sim::Server; - - use super::*; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -619,35 +614,6 @@ mod tests { project.identity.id } - async fn add_sleds( - cptestctx: &ControlPlaneTestContext, - num_sleds: usize, - ) -> Vec<(SledUuid, Server)> { - let mut sas = Vec::with_capacity(num_sleds); - for _ in 0..num_sleds { - let sa_id = SledUuid::new_v4(); - let log = - cptestctx.logctx.log.new(o!("sled_id" => sa_id.to_string())); - let addr = - cptestctx.server.get_http_server_internal_address().await; - - info!(&cptestctx.logctx.log, "Adding simulated sled"; "sled_id" => %sa_id); - let update_dir = Utf8Path::new("/should/be/unused"); - let sa = start_sled_agent( - log, - addr, - sa_id, - &update_dir, - omicron_sled_agent::sim::SimMode::Explicit, - ) - .await - .unwrap(); - sas.push((sa_id, sa)); - } - - sas - } - async fn create_instance( client: &ClientTestContext, ) -> omicron_common::api::external::Instance { @@ -675,32 +641,11 @@ mod tests { .await } - fn select_first_alternate_sled( - db_vmm: &db::model::Vmm, - other_sleds: &[(SledUuid, Server)], - ) -> SledUuid { - let default_sled_uuid: SledUuid = - 
nexus_test_utils::SLED_AGENT_UUID.parse().unwrap(); - if other_sleds.is_empty() { - panic!("need at least one other sled"); - } - - if other_sleds.iter().any(|sled| sled.0 == default_sled_uuid) { - panic!("default test sled agent was in other_sleds"); - } - - if db_vmm.sled_id == default_sled_uuid.into_untyped_uuid() { - other_sleds[0].0 - } else { - default_sled_uuid - } - } - #[nexus_test(server = crate::Server)] async fn test_saga_basic_usage_succeeds( cptestctx: &ControlPlaneTestContext, ) { - let other_sleds = add_sleds(cptestctx, 1).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let _project_id = setup_test_project(&client).await; @@ -714,7 +659,8 @@ mod tests { let state = test_helpers::instance_fetch(cptestctx, instance_id).await; let vmm = state.vmm().as_ref().unwrap(); - let dst_sled_id = select_first_alternate_sled(vmm, &other_sleds); + let dst_sled_id = + test_helpers::select_first_alternate_sled(vmm, &other_sleds[..]); let params = Params { serialized_authn: authn::saga::Serialized::for_opctx(&opctx), instance: state.instance().clone(), @@ -747,7 +693,7 @@ mod tests { cptestctx: &ControlPlaneTestContext, ) { let log = &cptestctx.logctx.log; - let other_sleds = add_sleds(cptestctx, 1).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let _project_id = setup_test_project(&client).await; @@ -772,8 +718,10 @@ mod tests { .as_ref() .expect("instance should have a vmm before migrating"); - let dst_sled_id = - select_first_alternate_sled(old_vmm, &other_sleds); + let dst_sled_id = test_helpers::select_first_alternate_sled( + old_vmm, + &other_sleds[..], + ); info!(log, "setting up new migration saga"; "old_instance" => ?old_instance, diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 75524c4d14c..06f0f532586 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -748,6 +748,7 @@ mod test { use crate::app::db::model::VmmRuntimeState; use crate::app::saga::create_saga_dag; use crate::app::sagas::test_helpers; + use crate::app::OpContext; use crate::external_api::params; use chrono::Utc; use dropshot::test_util::ClientTestContext; @@ -757,6 +758,9 @@ mod test { create_default_ip_pool, create_project, object_create, }; use nexus_test_utils_macros::nexus_test; + use omicron_common::api::internal::nexus::{ + MigrationRuntimeState, MigrationState, Migrations, + }; use omicron_test_utils::dev::poll; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; @@ -1079,4 +1083,244 @@ mod test { .await ); } + + // === migration source completed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_completed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let test = MigrationTest::setup(cptestctx).await; + + // Pretend the migration source has completed. + test.update_src_state( + cptestctx, + VmmState::Stopping, + MigrationState::Completed, + ) + .await; + + // Run the instance-update saga. 
+ let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(test.saga_params()) + .await + .expect("update saga should succeed"); + + test.verify_src_succeeded(cptestctx).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_completed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let test = MigrationTest::setup(cptestctx).await; + + // Pretend the migration source has completed. + test.update_src_state( + cptestctx, + VmmState::Stopping, + MigrationState::Completed, + ) + .await; + + // Build the saga DAG with the provided test parameters + let dag = + create_saga_dag::(test.saga_params()).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + test.verify_src_succeeded(cptestctx).await; + } + + struct MigrationTest { + instance_id: InstanceUuid, + state: InstanceSnapshot, + authz_instance: authz::Instance, + opctx: OpContext, + } + + impl MigrationTest { + fn target_vmm_id(&self) -> Uuid { + self.state + .target_vmm + .as_ref() + .expect("migrating instance must have a target VMM") + .id + } + + async fn setup(cptestctx: &ControlPlaneTestContext) -> Self { + use crate::app::sagas::instance_migrate; + + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let _project_id = setup_test_project(&client).await; + + let opctx = test_helpers::test_opctx(cptestctx); + let instance = create_instance(client).await; + let instance_id = + InstanceUuid::from_untyped_uuid(instance.identity.id); + + // Poke the instance to get it into the Running state. + let state = + test_helpers::instance_fetch(cptestctx, instance_id).await; + test_helpers::instance_simulate(cptestctx, &instance_id).await; + + // Wait for the instance update saga triggered by a transition to + // Running to complete. + let state = wait_for_update(cptestctx, state.instance()).await; + let vmm = state.vmm().as_ref().unwrap(); + let dst_sled_id = test_helpers::select_first_alternate_sled( + vmm, + &other_sleds[..], + ); + let params = instance_migrate::Params { + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + instance: state.instance().clone(), + src_vmm: vmm.clone(), + migrate_params: params::InstanceMigrate { + dst_sled_id: dst_sled_id.into_untyped_uuid(), + }, + }; + + nexus + .sagas + .saga_execute::(params) + .await + .expect("Migration saga should succeed"); + + let (_, _, authz_instance, ..) 
= + LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .fetch() + .await + .expect("test instance should be present in datastore"); + let state = datastore + .instance_fetch_all(&opctx, &authz_instance) + .await + .expect("test instance should be present in datastore"); + + Self { authz_instance, state, opctx, instance_id } + } + + async fn update_src_state( + &self, + cptestctx: &ControlPlaneTestContext, + vmm_state: VmmState, + migration_state: MigrationState, + ) { + let src_vmm = self + .state + .active_vmm + .as_ref() + .expect("must have an active VMM"); + let vmm_id = PropolisUuid::from_untyped_uuid(src_vmm.id); + let new_runtime = nexus_db_model::VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation(src_vmm.runtime.gen.0.next()), + state: vmm_state, + }; + + let migration = self + .state + .migration + .as_ref() + .expect("must have an active migration"); + let migration_out = MigrationRuntimeState { + migration_id: migration.id, + state: migration_state, + gen: migration.source_gen.0.next(), + time_updated: Utc::now(), + }; + let migrations = Migrations { + migration_in: None, + migration_out: Some(&migration_out), + }; + + info!( + cptestctx.logctx.log, + "updating source VMM state..."; + "propolis_id" => %vmm_id, + "new_runtime" => ?new_runtime, + "migration_out" => ?migration_out, + ); + + cptestctx + .server + .server_context() + .nexus + .datastore() + .vmm_and_migration_update_runtime( + vmm_id, + &new_runtime, + migrations, + ) + .await + .expect("updating migration source state should succeed"); + } + + fn saga_params(&self) -> Params { + Params { + authz_instance: self.authz_instance.clone(), + serialized_authn: authn::saga::Serialized::for_opctx( + &self.opctx, + ), + } + } + + async fn verify_src_succeeded( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + let state = + test_helpers::instance_fetch(cptestctx, self.instance_id).await; + let instance = state.instance(); + let instance_runtime = instance.runtime(); + + let active_vmm_id = instance_runtime.propolis_id; + assert_eq!( + active_vmm_id, + Some(self.target_vmm_id()), + "target VMM must be in the active VMM position after source success", + ); + assert_eq!( + instance_runtime.dst_propolis_id, + Some(active_vmm_id.unwrap()), + "target VMM ID must remain set until target VMM reports success", + ); + assert_eq!( + instance_runtime.migration_id, + self.state.instance.runtime().migration_id, + "migration ID must remain set until target VMM reports success", + ); + assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); + assert_instance_unlocked(instance); + assert!( + !test_helpers::no_virtual_provisioning_resource_records_exist( + cptestctx + ) + .await, + "virtual provisioning records must exist after successful migration", + ); + assert!( + !test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx) + .await, + "virtual provisioning records must exist after successful migration", + ); + assert!( + !test_helpers::no_sled_resource_instance_records_exist( + cptestctx + ) + .await, + "sled resource records must exist after successful migration", + ); + } + } } diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 6231f1fe56b..eb6e30bd22e 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -11,6 +11,7 @@ use crate::{ Nexus, }; use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; +use camino::Utf8Path; use diesel::{ BoolExpressionMethods, 
ExpressionMethods, QueryDsl, SelectableHelper, }; @@ -21,11 +22,13 @@ use nexus_db_queries::{ context::OpContext, db::{datastore::InstanceAndActiveVmm, lookup::LookupPath, DataStore}, }; +use nexus_test_interface::NexusServer; +use nexus_test_utils::start_sled_agent; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::external::NameOrId; use omicron_test_utils::dev::poll; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid, SledUuid}; use sled_agent_client::TestInterfaces as _; use slog::{info, warn, Logger}; use std::{num::NonZeroU32, sync::Arc, time::Duration}; @@ -660,3 +663,51 @@ pub(crate) async fn assert_no_failed_undo_steps( assert!(saga_node_events.is_empty()); } + +pub(crate) async fn add_sleds( + cptestctx: &ControlPlaneTestContext, + num_sleds: usize, +) -> Vec<(SledUuid, omicron_sled_agent::sim::Server)> { + let mut sas = Vec::with_capacity(num_sleds); + for _ in 0..num_sleds { + let sa_id = SledUuid::new_v4(); + let log = cptestctx.logctx.log.new(o!("sled_id" => sa_id.to_string())); + let addr = cptestctx.server.get_http_server_internal_address().await; + + info!(&cptestctx.logctx.log, "Adding simulated sled"; "sled_id" => %sa_id); + let update_dir = Utf8Path::new("/should/be/unused"); + let sa = start_sled_agent( + log, + addr, + sa_id, + &update_dir, + omicron_sled_agent::sim::SimMode::Explicit, + ) + .await + .unwrap(); + sas.push((sa_id, sa)); + } + + sas +} + +pub(crate) fn select_first_alternate_sled( + db_vmm: &crate::app::db::model::Vmm, + other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], +) -> SledUuid { + let default_sled_uuid: SledUuid = + nexus_test_utils::SLED_AGENT_UUID.parse().unwrap(); + if other_sleds.is_empty() { + panic!("need at least one other sled"); + } + + if other_sleds.iter().any(|sled| sled.0 == default_sled_uuid) { + panic!("default test sled agent was in other_sleds"); + } + + if db_vmm.sled_id == default_sled_uuid.into_untyped_uuid() { + other_sleds[0].0 + } else { + default_sled_uuid + } +} From b9e02bc47674277f4d51e48e137c2491cb34baed Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 17 Jul 2024 13:56:11 -0700 Subject: [PATCH 152/234] add unwinding test for migration source success --- nexus/src/app/sagas/instance_update/mod.rs | 166 ++++++++++++++++++--- nexus/src/app/sagas/test_helpers.rs | 33 ++++ 2 files changed, 180 insertions(+), 19 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 06f0f532586..651505c8de6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -764,6 +764,7 @@ mod test { use omicron_test_utils::dev::poll; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; + use omicron_uuid_kinds::SledUuid; use std::time::Duration; use uuid::Uuid; @@ -900,6 +901,7 @@ mod test { async fn test_active_vmm_destroyed_succeeds( cptestctx: &ControlPlaneTestContext, ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; // Run the instance-update saga. 
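         // (Project creation now happens at the top of each test body rather
         // than inside `setup_active_vmm_destroyed_test`, since the unwinding
         // test below re-runs that setup helper and the project should only
         // be created once.)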
@@ -918,6 +920,7 @@ mod test { async fn test_active_vmm_destroyed_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; // Build the saga DAG with the provided test parameters @@ -937,36 +940,71 @@ mod test { async fn test_active_vmm_destroyed_action_failure_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let (state, Params { serialized_authn, authz_instance }) = - setup_active_vmm_destroyed_test(cptestctx).await; - let instance_id = - InstanceUuid::from_untyped_uuid(state.instance().id()); - + let _project_id = setup_test_project(&cptestctx.external_client).await; + let nexus = &cptestctx.server.server_context().nexus; test_helpers::action_failure_can_unwind::( - &cptestctx.server.server_context().nexus, + nexus, || { Box::pin(async { - Params { - serialized_authn: serialized_authn.clone(), - authz_instance: authz_instance.clone(), - } + let (_, params) = + setup_active_vmm_destroyed_test(cptestctx).await; + params }) }, || { Box::pin({ async { - let state = test_helpers::instance_fetch( + let state = test_helpers::instance_fetch_by_name( cptestctx, - instance_id, + INSTANCE_NAME, + PROJECT_NAME, ) .await; + let instance = state.instance(); + // Unlike most other sagas, we actually don't unwind the // work performed by an update saga, as we would prefer // that at least some of it succeeds. The only thing // that *needs* to be rolled back when an // instance-update saga fails is that the updater lock // *MUST* be released so that a subsequent saga can run. - assert_instance_unlocked(state.instance()); + assert_instance_unlocked(instance); + + // Throw away the instance so that subsequent unwinding + // tests also operate on an instance in the correct + // preconditions to actually run the saga path we mean + // to test. + let instance_id = + InstanceUuid::from_untyped_uuid(instance.id()); + // Depending on where we got to in the update saga, the + // sled-agent may or may not actually be willing to stop + // the instance, so just manually update the DB record + // into a state where we can delete it to make sure + // everything is cleaned up for the next run. + nexus + .datastore() + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + gen: Generation( + instance.runtime().gen.0.next(), + ), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + .unwrap(); + + test_helpers::instance_delete_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; } }) }, @@ -983,7 +1021,6 @@ mod test { let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let datastore = nexus.datastore().clone(); - let _project_id = setup_test_project(&client).await; let opctx = test_helpers::test_opctx(cptestctx); let instance = create_instance(client).await; @@ -1090,7 +1127,9 @@ mod test { async fn test_migration_source_completed_succeeds( cptestctx: &ControlPlaneTestContext, ) { - let test = MigrationTest::setup(cptestctx).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let test = MigrationTest::setup(cptestctx, &other_sleds).await; // Pretend the migration source has completed. 
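         // ("Pretending" here means writing a `Stopping` VMM state and a
         // `Completed` migration-out record directly to the database, just as
         // a state report from the source's sled-agent would.)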
test.update_src_state( @@ -1115,7 +1154,9 @@ mod test { async fn test_migration_source_completed_actions_succeed_idempotently( cptestctx: &ControlPlaneTestContext, ) { - let test = MigrationTest::setup(cptestctx).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let test = MigrationTest::setup(cptestctx, &other_sleds).await; // Pretend the migration source has completed. test.update_src_state( @@ -1138,6 +1179,92 @@ mod test { test.verify_src_succeeded(cptestctx).await; } + #[nexus_test(server = crate::Server)] + async fn test_migration_source_completed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + let test = + MigrationTest::setup(cptestctx, &other_sleds).await; + // Pretend the migration source has completed. + test.update_src_state( + cptestctx, + VmmState::Stopping, + MigrationState::Completed, + ) + .await; + test.saga_params() + }) + }, + || { + Box::pin({ + async { + let state = test_helpers::instance_fetch_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + let instance = state.instance(); + + // Unlike most other sagas, we actually don't unwind the + // work performed by an update saga, as we would prefer + // that at least some of it succeeds. The only thing + // that *needs* to be rolled back when an + // instance-update saga fails is that the updater lock + // *MUST* be released so that a subsequent saga can run. + assert_instance_unlocked(instance); + + // Throw away the instance so that subsequent unwinding + // tests also operate on an instance in the correct + // preconditions to actually run the saga path we mean + // to test. + let instance_id = + InstanceUuid::from_untyped_uuid(instance.id()); + // Depending on where we got to in the update saga, the + // sled-agent may or may not actually be willing to stop + // the instance, so just manually update the DB record + // into a state where we can delete it to make sure + // everything is cleaned up for the next run. 
+ nexus + .datastore() + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + gen: Generation( + instance.runtime().gen.0.next(), + ), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + .unwrap(); + + test_helpers::instance_delete_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + } + }) + }, + &cptestctx.logctx.log, + ) + .await; + } + struct MigrationTest { instance_id: InstanceUuid, state: InstanceSnapshot, @@ -1154,14 +1281,15 @@ mod test { .id } - async fn setup(cptestctx: &ControlPlaneTestContext) -> Self { + async fn setup( + cptestctx: &ControlPlaneTestContext, + other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], + ) -> Self { use crate::app::sagas::instance_migrate; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let client = &cptestctx.external_client; let nexus = &cptestctx.server.server_context().nexus; let datastore = nexus.datastore(); - let _project_id = setup_test_project(&client).await; let opctx = test_helpers::test_opctx(cptestctx); let instance = create_instance(client).await; diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index eb6e30bd22e..74041c4f52a 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -194,6 +194,39 @@ pub async fn instance_fetch( db_state } +pub async fn instance_fetch_by_name( + cptestctx: &ControlPlaneTestContext, + name: &str, + project_name: &str, +) -> InstanceAndActiveVmm { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = test_opctx(&cptestctx); + let instance_selector = + nexus_types::external_api::params::InstanceSelector { + project: Some(project_name.to_string().try_into().unwrap()), + instance: name.to_string().try_into().unwrap(), + }; + + let instance_lookup = + nexus.instance_lookup(&opctx, instance_selector).unwrap(); + let (_, _, authz_instance, ..) = instance_lookup.fetch().await.unwrap(); + + let db_state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .expect("test instance's info should be fetchable"); + + info!(&cptestctx.logctx.log, "refetched instance info from db"; + "instance_name" => name, + "project_name" => project_name, + "instance_id" => %authz_instance.id(), + "instance_and_vmm" => ?db_state, + ); + + db_state +} + pub(crate) async fn instance_wait_for_state( cptestctx: &ControlPlaneTestContext, instance_id: InstanceUuid, From 77a5a8527d46e5ec63293e2c87dc89806d47d71b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 17 Jul 2024 14:28:29 -0700 Subject: [PATCH 153/234] shut up clippy --- nexus/src/app/sagas/instance_update/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 651505c8de6..ae62a85f168 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1305,10 +1305,8 @@ mod test { // Running to complete. 
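             // (`wait_for_update` polls the instance record until its
             // `updater_gen` has advanced past the starting generation and
             // `updater_id` is cleared, i.e. an update saga has run and
             // released the updater lock.)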
let state = wait_for_update(cptestctx, state.instance()).await; let vmm = state.vmm().as_ref().unwrap(); - let dst_sled_id = test_helpers::select_first_alternate_sled( - vmm, - &other_sleds[..], - ); + let dst_sled_id = + test_helpers::select_first_alternate_sled(vmm, other_sleds); let params = instance_migrate::Params { serialized_authn: authn::saga::Serialized::for_opctx(&opctx), instance: state.instance().clone(), From 4d7e582784e0e8242d800914e5e8900bd067a27c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 17 Jul 2024 15:55:31 -0700 Subject: [PATCH 154/234] urghh --- nexus/examples/config-second.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 40f5d95a5f0..40fe8da632e 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -132,6 +132,8 @@ region_replacement.period_secs = 30 region_replacement_driver.period_secs = 10 # How frequently to query the status of active instances. instance_watcher.period_secs = 30 +# How frequently to schedule new instance update sagass. +instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 From 2c86e489a06881ca16cdc5c60f76a5c56491a628 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 17 Jul 2024 16:08:38 -0700 Subject: [PATCH 155/234] more migration tests --- nexus/src/app/sagas/instance_update/mod.rs | 355 ++++++++++++++------- 1 file changed, 233 insertions(+), 122 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index ae62a85f168..c152369d378 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -194,8 +194,8 @@ impl UpdatesRequired { debug_assert_eq!(new_vmm.id, migration.target_propolis_id); new_runtime.propolis_id = Some(migration.target_propolis_id); - update_required = true; network_config = Some(NetworkConfigUpdate::to_vmm(new_vmm)); + update_required = true; } // If the target reports that the migration has completed, @@ -895,6 +895,60 @@ mod test { ) } + async fn after_unwinding(cptestctx: &ControlPlaneTestContext) { + let state = test_helpers::instance_fetch_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + let instance = state.instance(); + + // Unlike most other sagas, we actually don't unwind the + // work performed by an update saga, as we would prefer + // that at least some of it succeeds. The only thing + // that *needs* to be rolled back when an + // instance-update saga fails is that the updater lock + // *MUST* be released so that a subsequent saga can run. + assert_instance_unlocked(instance); + + // Throw away the instance so that subsequent unwinding + // tests also operate on an instance in the correct + // preconditions to actually run the saga path we mean + // to test. + let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); + // Depending on where we got to in the update saga, the + // sled-agent may or may not actually be willing to stop + // the instance, so just manually update the DB record + // into a state where we can delete it to make sure + // everything is cleaned up for the next run. 
+ cptestctx + .server + .server_context() + .nexus + .datastore() + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + gen: Generation(instance.runtime().gen.0.next()), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + .unwrap(); + + test_helpers::instance_delete_by_name( + cptestctx, + INSTANCE_NAME, + PROJECT_NAME, + ) + .await; + } + // === Active VMM destroyed tests ==== #[nexus_test(server = crate::Server)] @@ -951,63 +1005,7 @@ mod test { params }) }, - || { - Box::pin({ - async { - let state = test_helpers::instance_fetch_by_name( - cptestctx, - INSTANCE_NAME, - PROJECT_NAME, - ) - .await; - let instance = state.instance(); - - // Unlike most other sagas, we actually don't unwind the - // work performed by an update saga, as we would prefer - // that at least some of it succeeds. The only thing - // that *needs* to be rolled back when an - // instance-update saga fails is that the updater lock - // *MUST* be released so that a subsequent saga can run. - assert_instance_unlocked(instance); - - // Throw away the instance so that subsequent unwinding - // tests also operate on an instance in the correct - // preconditions to actually run the saga path we mean - // to test. - let instance_id = - InstanceUuid::from_untyped_uuid(instance.id()); - // Depending on where we got to in the update saga, the - // sled-agent may or may not actually be willing to stop - // the instance, so just manually update the DB record - // into a state where we can delete it to make sure - // everything is cleaned up for the next run. - nexus - .datastore() - .instance_update_runtime( - &instance_id, - &InstanceRuntimeState { - time_updated: Utc::now(), - gen: Generation( - instance.runtime().gen.0.next(), - ), - propolis_id: None, - dst_propolis_id: None, - migration_id: None, - nexus_state: InstanceState::NoVmm, - }, - ) - .await - .unwrap(); - - test_helpers::instance_delete_by_name( - cptestctx, - INSTANCE_NAME, - PROJECT_NAME, - ) - .await; - } - }) - }, + || Box::pin(after_unwinding(cptestctx)), &cptestctx.logctx.log, ) .await; @@ -1203,63 +1201,95 @@ mod test { test.saga_params() }) }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + + // === migration target completed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let test = MigrationTest::setup(cptestctx, &other_sleds).await; + + // Pretend the migration target has completed. + test.update_target_state( + cptestctx, + VmmState::Running, + MigrationState::Completed, + ) + .await; + + // Run the instance-update saga. + let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(test.saga_params()) + .await + .expect("update saga should succeed"); + + test.verify_target_succeeded(cptestctx).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let test = MigrationTest::setup(cptestctx, &other_sleds).await; + + // Pretend the migration target has completed. 
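+        // (As on the source side, this writes a `Running` VMM state and a
+        // `Completed` migration-in record for the target directly to the
+        // database.)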
+ test.update_target_state( + cptestctx, + VmmState::Running, + MigrationState::Completed, + ) + .await; + + // Build the saga DAG with the provided test parameters + let dag = + create_saga_dag::(test.saga_params()).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + test.verify_target_succeeded(cptestctx).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + test_helpers::action_failure_can_unwind::( + nexus, || { - Box::pin({ - async { - let state = test_helpers::instance_fetch_by_name( - cptestctx, - INSTANCE_NAME, - PROJECT_NAME, - ) - .await; - let instance = state.instance(); - - // Unlike most other sagas, we actually don't unwind the - // work performed by an update saga, as we would prefer - // that at least some of it succeeds. The only thing - // that *needs* to be rolled back when an - // instance-update saga fails is that the updater lock - // *MUST* be released so that a subsequent saga can run. - assert_instance_unlocked(instance); - - // Throw away the instance so that subsequent unwinding - // tests also operate on an instance in the correct - // preconditions to actually run the saga path we mean - // to test. - let instance_id = - InstanceUuid::from_untyped_uuid(instance.id()); - // Depending on where we got to in the update saga, the - // sled-agent may or may not actually be willing to stop - // the instance, so just manually update the DB record - // into a state where we can delete it to make sure - // everything is cleaned up for the next run. - nexus - .datastore() - .instance_update_runtime( - &instance_id, - &InstanceRuntimeState { - time_updated: Utc::now(), - gen: Generation( - instance.runtime().gen.0.next(), - ), - propolis_id: None, - dst_propolis_id: None, - migration_id: None, - nexus_state: InstanceState::NoVmm, - }, - ) - .await - .unwrap(); - - test_helpers::instance_delete_by_name( - cptestctx, - INSTANCE_NAME, - PROJECT_NAME, - ) - .await; - } + Box::pin(async { + let test = + MigrationTest::setup(cptestctx, &other_sleds).await; + // Pretend the migration target has completed. 
+ test.update_target_state( + cptestctx, + VmmState::Running, + MigrationState::Completed, + ) + .await; + test.saga_params() }) }, + || Box::pin(after_unwinding(cptestctx)), &cptestctx.logctx.log, ) .await; @@ -1392,6 +1422,59 @@ mod test { .expect("updating migration source state should succeed"); } + async fn update_target_state( + &self, + cptestctx: &ControlPlaneTestContext, + vmm_state: VmmState, + migration_state: MigrationState, + ) { + let target_vmm = + self.state.target_vmm.as_ref().expect("must have a target VMM"); + let vmm_id = PropolisUuid::from_untyped_uuid(target_vmm.id); + let new_runtime = nexus_db_model::VmmRuntimeState { + time_state_updated: Utc::now(), + gen: Generation(target_vmm.runtime.gen.0.next()), + state: vmm_state, + }; + + let migration = self + .state + .migration + .as_ref() + .expect("must have an active migration"); + let migration_in = MigrationRuntimeState { + migration_id: migration.id, + state: migration_state, + gen: migration.target_gen.0.next(), + time_updated: Utc::now(), + }; + let migrations = Migrations { + migration_in: Some(&migration_in), + migration_out: None, + }; + + info!( + cptestctx.logctx.log, + "updating target VMM state..."; + "propolis_id" => %vmm_id, + "new_runtime" => ?new_runtime, + "migration_in" => ?migration_in, + ); + + cptestctx + .server + .server_context() + .nexus + .datastore() + .vmm_and_migration_update_runtime( + vmm_id, + &new_runtime, + migrations, + ) + .await + .expect("updating migration target state should succeed"); + } + fn saga_params(&self) -> Params { Params { authz_instance: self.authz_instance.clone(), @@ -1405,20 +1488,12 @@ mod test { &self, cptestctx: &ControlPlaneTestContext, ) { - let state = - test_helpers::instance_fetch(cptestctx, self.instance_id).await; + let state = self.verify_migration_succeeded(cptestctx).await; let instance = state.instance(); let instance_runtime = instance.runtime(); - - let active_vmm_id = instance_runtime.propolis_id; - assert_eq!( - active_vmm_id, - Some(self.target_vmm_id()), - "target VMM must be in the active VMM position after source success", - ); assert_eq!( instance_runtime.dst_propolis_id, - Some(active_vmm_id.unwrap()), + Some(self.target_vmm_id()), "target VMM ID must remain set until target VMM reports success", ); assert_eq!( @@ -1426,6 +1501,40 @@ mod test { self.state.instance.runtime().migration_id, "migration ID must remain set until target VMM reports success", ); + } + + async fn verify_target_succeeded( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + let state = self.verify_migration_succeeded(cptestctx).await; + let instance = state.instance(); + let instance_runtime = instance.runtime(); + assert_eq!( + instance_runtime.dst_propolis_id, None, + "target VMM ID must be unset once VMM reports success", + ); + assert_eq!( + instance_runtime.migration_id, None, + "migration ID must be unset once target VMM reports success", + ); + } + + async fn verify_migration_succeeded( + &self, + cptestctx: &ControlPlaneTestContext, + ) -> InstanceAndActiveVmm { + let state = + test_helpers::instance_fetch(cptestctx, self.instance_id).await; + let instance = state.instance(); + let instance_runtime = instance.runtime(); + + let active_vmm_id = instance_runtime.propolis_id; + assert_eq!( + active_vmm_id, + Some(self.target_vmm_id()), + "target VMM must be in the active VMM position after migration success", + ); assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); assert_instance_unlocked(instance); assert!( @@ -1447,6 +1556,8 @@ mod test { .await, 
"sled resource records must exist after successful migration", ); + + state } } } From 8dae3866306665c69cdcf5a501e7e2bbbb6fedd6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 10:41:21 -0700 Subject: [PATCH 156/234] massively overengineered test DSL thingy --- nexus/src/app/sagas/instance_update/mod.rs | 471 ++++++++++++++------- nexus/src/app/sagas/test_helpers.rs | 51 ++- 2 files changed, 359 insertions(+), 163 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index c152369d378..22859d0f48f 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1127,25 +1127,12 @@ mod test { ) { let _project_id = setup_test_project(&cptestctx.external_client).await; let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let test = MigrationTest::setup(cptestctx, &other_sleds).await; - - // Pretend the migration source has completed. - test.update_src_state( - cptestctx, - VmmState::Stopping, - MigrationState::Completed, - ) - .await; - - // Run the instance-update saga. - let nexus = &cptestctx.server.server_context().nexus; - nexus - .sagas - .saga_execute::(test.saga_params()) + MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) .await - .expect("update saga should succeed"); - - test.verify_src_succeeded(cptestctx).await; + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; } #[nexus_test(server = crate::Server)] @@ -1154,27 +1141,13 @@ mod test { ) { let _project_id = setup_test_project(&cptestctx.external_client).await; let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let test = MigrationTest::setup(cptestctx, &other_sleds).await; - - // Pretend the migration source has completed. - test.update_src_state( - cptestctx, - VmmState::Stopping, - MigrationState::Completed, - ) - .await; - - // Build the saga DAG with the provided test parameters - let dag = - create_saga_dag::(test.saga_params()).unwrap(); - - crate::app::sagas::test_helpers::actions_succeed_idempotently( - &cptestctx.server.server_context().nexus, - dag, - ) - .await; - test.verify_src_succeeded(cptestctx).await; + MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; } #[nexus_test(server = crate::Server)] @@ -1185,20 +1158,17 @@ mod test { let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let _project_id = setup_test_project(&cptestctx.external_client).await; + let outcome = MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping); + test_helpers::action_failure_can_unwind::( nexus, || { Box::pin(async { - let test = - MigrationTest::setup(cptestctx, &other_sleds).await; - // Pretend the migration source has completed. - test.update_src_state( - cptestctx, - VmmState::Stopping, - MigrationState::Completed, - ) - .await; - test.saga_params() + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1215,25 +1185,13 @@ mod test { ) { let _project_id = setup_test_project(&cptestctx.external_client).await; let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let test = MigrationTest::setup(cptestctx, &other_sleds).await; - - // Pretend the migration target has completed. 
- test.update_target_state( - cptestctx, - VmmState::Running, - MigrationState::Completed, - ) - .await; - // Run the instance-update saga. - let nexus = &cptestctx.server.server_context().nexus; - nexus - .sagas - .saga_execute::(test.saga_params()) + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) .await - .expect("update saga should succeed"); - - test.verify_target_succeeded(cptestctx).await; + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; } #[nexus_test(server = crate::Server)] @@ -1242,51 +1200,95 @@ mod test { ) { let _project_id = setup_test_project(&cptestctx.external_client).await; let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let test = MigrationTest::setup(cptestctx, &other_sleds).await; - // Pretend the migration target has completed. - test.update_target_state( - cptestctx, - VmmState::Running, - MigrationState::Completed, - ) - .await; + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } - // Build the saga DAG with the provided test parameters - let dag = - create_saga_dag::(test.saga_params()).unwrap(); + #[nexus_test(server = crate::Server)] + async fn test_migration_target_completed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + let outcome = MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running); - crate::app::sagas::test_helpers::actions_succeed_idempotently( - &cptestctx.server.server_context().nexus, - dag, + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, ) .await; + } + + // === migration completed and source destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_source_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - test.verify_target_succeeded(cptestctx).await; + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .source(MigrationState::Completed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; } #[nexus_test(server = crate::Server)] - async fn test_migration_target_completed_can_unwind( + async fn test_migration_completed_source_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .source(MigrationState::Completed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_source_destroyed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { let nexus = 
&cptestctx.server.server_context().nexus; let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; let _project_id = setup_test_project(&cptestctx.external_client).await; + let outcome = MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .source(MigrationState::Completed, VmmState::Destroyed); + test_helpers::action_failure_can_unwind::( nexus, || { Box::pin(async { - let test = - MigrationTest::setup(cptestctx, &other_sleds).await; - // Pretend the migration target has completed. - test.update_target_state( - cptestctx, - VmmState::Running, - MigrationState::Completed, - ) - .await; - test.saga_params() + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1295,23 +1297,64 @@ mod test { .await; } + #[derive(Clone, Copy, Default)] + struct MigrationOutcome { + source: Option<(MigrationState, VmmState)>, + target: Option<(MigrationState, VmmState)>, + failed: bool, + } + + impl MigrationOutcome { + fn source(self, migration: MigrationState, vmm: VmmState) -> Self { + let failed = self.failed + || migration == MigrationState::Failed + || vmm == VmmState::Failed; + Self { source: Some((migration, vmm)), failed, ..self } + } + + fn target(self, migration: MigrationState, vmm: VmmState) -> Self { + let failed = self.failed + || migration == MigrationState::Failed + || vmm == VmmState::Failed; + Self { target: Some((migration, vmm)), failed, ..self } + } + + async fn setup_test( + self, + cptestctx: &ControlPlaneTestContext, + other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], + ) -> MigrationTest { + MigrationTest::setup(self, cptestctx, other_sleds).await + } + } + struct MigrationTest { + outcome: MigrationOutcome, instance_id: InstanceUuid, - state: InstanceSnapshot, + initial_state: InstanceSnapshot, authz_instance: authz::Instance, opctx: OpContext, } impl MigrationTest { fn target_vmm_id(&self) -> Uuid { - self.state + self.initial_state .target_vmm .as_ref() .expect("migrating instance must have a target VMM") .id } + fn src_vmm_id(&self) -> Uuid { + self.initial_state + .active_vmm + .as_ref() + .expect("migrating instance must have a source VMM") + .id + } + async fn setup( + outcome: MigrationOutcome, cptestctx: &ControlPlaneTestContext, other_sleds: &[(SledUuid, omicron_sled_agent::sim::Server)], ) -> Self { @@ -1358,12 +1401,64 @@ mod test { .fetch() .await .expect("test instance should be present in datastore"); - let state = datastore + let initial_state = datastore .instance_fetch_all(&opctx, &authz_instance) .await .expect("test instance should be present in datastore"); - Self { authz_instance, state, opctx, instance_id } + let this = Self { + authz_instance, + initial_state, + outcome, + opctx, + instance_id, + }; + if let Some((migration_state, vmm_state)) = this.outcome.source { + this.update_src_state(cptestctx, vmm_state, migration_state) + .await; + } + + if let Some((migration_state, vmm_state)) = this.outcome.target { + this.update_target_state(cptestctx, vmm_state, migration_state) + .await; + } + + this + } + + async fn run_saga_basic_usage_succeeds_test( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + // Run the instance-update saga. 
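+            // (Same flow as the standalone "basic usage succeeds" tests:
+            // execute the saga once for this migration outcome, then let
+            // `verify` assert the expected post-migration state.)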
+ let nexus = &cptestctx.server.server_context().nexus; + nexus + .sagas + .saga_execute::(self.saga_params()) + .await + .expect("update saga should succeed"); + + // Check the results + self.verify(cptestctx).await; + } + + async fn run_actions_succeed_idempotently_test( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + // Build the saga DAG with the provided test parameters + let dag = create_saga_dag::(self.saga_params()) + .unwrap(); + + // Run the actions-succeed-idempotently test + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Check the results + self.verify(cptestctx).await; } async fn update_src_state( @@ -1373,7 +1468,7 @@ mod test { migration_state: MigrationState, ) { let src_vmm = self - .state + .initial_state .active_vmm .as_ref() .expect("must have an active VMM"); @@ -1385,7 +1480,7 @@ mod test { }; let migration = self - .state + .initial_state .migration .as_ref() .expect("must have an active migration"); @@ -1428,8 +1523,11 @@ mod test { vmm_state: VmmState, migration_state: MigrationState, ) { - let target_vmm = - self.state.target_vmm.as_ref().expect("must have a target VMM"); + let target_vmm = self + .initial_state + .target_vmm + .as_ref() + .expect("must have a target VMM"); let vmm_id = PropolisUuid::from_untyped_uuid(target_vmm.id); let new_runtime = nexus_db_model::VmmRuntimeState { time_state_updated: Utc::now(), @@ -1438,7 +1536,7 @@ mod test { }; let migration = self - .state + .initial_state .migration .as_ref() .expect("must have an active migration"); @@ -1484,80 +1582,137 @@ mod test { } } - async fn verify_src_succeeded( - &self, - cptestctx: &ControlPlaneTestContext, - ) { - let state = self.verify_migration_succeeded(cptestctx).await; - let instance = state.instance(); - let instance_runtime = instance.runtime(); - assert_eq!( - instance_runtime.dst_propolis_id, - Some(self.target_vmm_id()), - "target VMM ID must remain set until target VMM reports success", - ); - assert_eq!( - instance_runtime.migration_id, - self.state.instance.runtime().migration_id, - "migration ID must remain set until target VMM reports success", - ); - } - - async fn verify_target_succeeded( - &self, - cptestctx: &ControlPlaneTestContext, - ) { - let state = self.verify_migration_succeeded(cptestctx).await; - let instance = state.instance(); - let instance_runtime = instance.runtime(); - assert_eq!( - instance_runtime.dst_propolis_id, None, - "target VMM ID must be unset once VMM reports success", - ); - assert_eq!( - instance_runtime.migration_id, None, - "migration ID must be unset once target VMM reports success", + async fn verify(&self, cptestctx: &ControlPlaneTestContext) { + info!( + cptestctx.logctx.log, + "checking update saga results after migration"; + "source_outcome" => ?self.outcome.source.as_ref(), + "target_outcome" => ?self.outcome.target.as_ref(), + "migration_failed" => self.outcome.failed, ); - } - async fn verify_migration_succeeded( - &self, - cptestctx: &ControlPlaneTestContext, - ) -> InstanceAndActiveVmm { + use test_helpers::*; let state = test_helpers::instance_fetch(cptestctx, self.instance_id).await; let instance = state.instance(); let instance_runtime = instance.runtime(); let active_vmm_id = instance_runtime.propolis_id; + + assert_instance_unlocked(instance); + + if self.outcome.failed { + todo!("eliza: verify migration-failed postconditions"); + } else { + assert_eq!( + active_vmm_id, + Some(self.target_vmm_id()), + "target VMM must be in the active 
VMM position after migration success", + ); + assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); + if self + .outcome + .target + .as_ref() + .map(|(state, _)| state == &MigrationState::Completed) + .unwrap_or(false) + { + assert_eq!( + instance_runtime.dst_propolis_id, None, + "target VMM ID must be unset once target VMM reports success", + ); + assert_eq!( + instance_runtime.migration_id, None, + "migration ID must be unset once target VMM reports success", + ); + } else { + assert_eq!( + instance_runtime.dst_propolis_id, + Some(self.target_vmm_id()), + "target VMM ID must remain set until the target VMM reports success", + ); + assert_eq!( + instance_runtime.migration_id, + self.initial_state.instance.runtime().migration_id, + "migration ID must remain set until target VMM reports success", + ); + } + } + + let src_destroyed = self + .outcome + .source + .as_ref() + .map(|(_, state)| state == &VmmState::Destroyed) + .unwrap_or(false); assert_eq!( - active_vmm_id, - Some(self.target_vmm_id()), - "target VMM must be in the active VMM position after migration success", + self.src_resource_records_exist(cptestctx).await, + !src_destroyed, + "source VMM should exist if and only if the source hasn't been destroyed", ); - assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); - assert_instance_unlocked(instance); - assert!( - !test_helpers::no_virtual_provisioning_resource_records_exist( - cptestctx - ) - .await, - "virtual provisioning records must exist after successful migration", + + let target_destroyed = self + .outcome + .source + .as_ref() + .map(|(_, state)| state == &VmmState::Destroyed) + .unwrap_or(false); + + // TODO(eliza): this doesn't actually work because we don't actually + // poke the target simulated sled agent enough to get it to have + // resource records... 
+ // assert_eq!( + // self.target_resource_records_exist(cptestctx).await, + // !target_destroyed, + // "target VMM should exist if and only if the target hasn't been destroyed", + // ); + + let all_vmms_destroyed = src_destroyed && target_destroyed; + + assert_eq!( + no_virtual_provisioning_resource_records_exist(cptestctx).await, + all_vmms_destroyed, + "virtual provisioning resource records must exist as long as \ + the instance has a VMM", ); - assert!( - !test_helpers::no_virtual_provisioning_collection_records_using_instances(cptestctx) - .await, - "virtual provisioning records must exist after successful migration", - ); - assert!( - !test_helpers::no_sled_resource_instance_records_exist( + assert_eq!( + no_virtual_provisioning_collection_records_using_instances( cptestctx ) .await, - "sled resource records must exist after successful migration", + all_vmms_destroyed, + "virtual provisioning collection records must exist as long \ + as the instance has a VMM", ); - state + let instance_state = if all_vmms_destroyed { + InstanceState::NoVmm + } else { + InstanceState::Vmm + }; + assert_eq!(instance_runtime.nexus_state, instance_state); + } + + async fn src_resource_records_exist( + &self, + cptestctx: &ControlPlaneTestContext, + ) -> bool { + test_helpers::sled_resources_exist_for_vmm( + cptestctx, + PropolisUuid::from_untyped_uuid(self.src_vmm_id()), + ) + .await + } + + async fn target_resource_records_exist( + &self, + cptestctx: &ControlPlaneTestContext, + ) -> bool { + test_helpers::sled_resources_exist_for_vmm( + cptestctx, + PropolisUuid::from_untyped_uuid(self.target_vmm_id()), + ) + .await } } } diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 74041c4f52a..7a8cdf7f9de 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -28,7 +28,7 @@ use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::external::NameOrId; use omicron_test_utils::dev::poll; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, SledUuid}; +use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use sled_agent_client::TestInterfaces as _; use slog::{info, warn, Logger}; use std::{num::NonZeroU32, sync::Arc, time::Duration}; @@ -324,6 +324,12 @@ async fn instance_poll_state( pub async fn no_virtual_provisioning_resource_records_exist( cptestctx: &ControlPlaneTestContext, ) -> bool { + count_virtual_provisioning_resource_records(cptestctx).await == 0 +} + +pub async fn count_virtual_provisioning_resource_records( + cptestctx: &ControlPlaneTestContext, +) -> usize { use nexus_db_queries::db::model::VirtualProvisioningResource; use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl; @@ -331,7 +337,7 @@ pub async fn no_virtual_provisioning_resource_records_exist( let conn = datastore.pool_connection_for_tests().await.unwrap(); datastore - .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist") + .transaction_retry_wrapper("count_virtual_provisioning_resource_records") .transaction(&conn, |conn| async move { conn .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL) @@ -345,7 +351,7 @@ pub async fn no_virtual_provisioning_resource_records_exist( .get_results_async::(&conn) .await .unwrap() - .is_empty() + .len() ) }).await.unwrap() } @@ -353,6 +359,14 @@ pub async fn no_virtual_provisioning_resource_records_exist( pub async fn no_virtual_provisioning_collection_records_using_instances( cptestctx: 
&ControlPlaneTestContext, ) -> bool { + count_virtual_provisioning_collection_records_using_instances(cptestctx) + .await + == 0 +} + +pub async fn count_virtual_provisioning_collection_records_using_instances( + cptestctx: &ControlPlaneTestContext, +) -> usize { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; @@ -361,7 +375,7 @@ pub async fn no_virtual_provisioning_collection_records_using_instances( datastore .transaction_retry_wrapper( - "no_virtual_provisioning_collection_records_using_instances", + "count_virtual_provisioning_collection_records_using_instances", ) .transaction(&conn, |conn| async move { conn.batch_execute_async( @@ -377,7 +391,7 @@ pub async fn no_virtual_provisioning_collection_records_using_instances( .get_results_async::(&conn) .await .unwrap() - .is_empty()) + .len()) }) .await .unwrap() @@ -414,6 +428,33 @@ pub async fn no_sled_resource_instance_records_exist( .unwrap() } +pub async fn sled_resources_exist_for_vmm( + cptestctx: &ControlPlaneTestContext, + vmm_id: PropolisUuid, +) -> bool { + use nexus_db_queries::db::model::SledResource; + use nexus_db_queries::db::model::SledResourceKind; + use nexus_db_queries::db::schema::sled_resource::dsl; + + let datastore = cptestctx.server.server_context().nexus.datastore(); + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let results = dsl::sled_resource + .filter(dsl::kind.eq(SledResourceKind::Instance)) + .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) + .select(SledResource::as_select()) + .load_async(&*conn) + .await + .unwrap(); + info!( + cptestctx.logctx.log, + "queried sled reservation records for VMM"; + "vmm_id" => %vmm_id, + "results" => ?results, + ); + !results.is_empty() +} + /// Tests that the saga described by `dag` succeeds if each of its nodes is /// repeated. /// From 98d5c4a99f81d13d07252dac9669c1d961dc4ba5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 11:02:03 -0700 Subject: [PATCH 157/234] fix disappearing target sled resources --- nexus/src/app/sagas/instance_update/mod.rs | 28 ++++++++++++++-------- nexus/src/app/sagas/test_helpers.rs | 20 ++++++++++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 22859d0f48f..8f9828c86db 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1373,10 +1373,10 @@ mod test { let state = test_helpers::instance_fetch(cptestctx, instance_id).await; test_helpers::instance_simulate(cptestctx, &instance_id).await; - // Wait for the instance update saga triggered by a transition to // Running to complete. let state = wait_for_update(cptestctx, state.instance()).await; + let vmm = state.vmm().as_ref().unwrap(); let dst_sled_id = test_helpers::select_first_alternate_sled(vmm, other_sleds); @@ -1395,6 +1395,17 @@ mod test { .await .expect("Migration saga should succeed"); + // Poke the destination sled just enough to make it appear to have a VMM. + test_helpers::instance_single_step_on_sled( + cptestctx, + &instance_id, + &dst_sled_id, + ) + .await; + // Wait for the instance update saga triggered by poking the target + // VMM to complete (it should be a NOP). + wait_for_update(cptestctx, state.instance()).await; + let (_, _, authz_instance, ..) 
= LookupPath::new(&opctx, &datastore) .instance_id(instance_id.into_untyped_uuid()) @@ -1653,19 +1664,16 @@ mod test { let target_destroyed = self .outcome - .source + .target .as_ref() .map(|(_, state)| state == &VmmState::Destroyed) .unwrap_or(false); - // TODO(eliza): this doesn't actually work because we don't actually - // poke the target simulated sled agent enough to get it to have - // resource records... - // assert_eq!( - // self.target_resource_records_exist(cptestctx).await, - // !target_destroyed, - // "target VMM should exist if and only if the target hasn't been destroyed", - // ); + assert_eq!( + self.target_resource_records_exist(cptestctx).await, + !target_destroyed, + "target VMM should exist if and only if the target hasn't been destroyed", + ); let all_vmms_destroyed = src_destroyed && target_destroyed; diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 7a8cdf7f9de..31a77d49988 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -142,6 +142,26 @@ pub(crate) async fn instance_simulate( sa.instance_finish_transition(instance_id.into_untyped_uuid()).await; } +pub(crate) async fn instance_single_step_on_sled( + cptestctx: &ControlPlaneTestContext, + instance_id: &InstanceUuid, + sled_id: &SledUuid, +) { + info!( + &cptestctx.logctx.log, + "Single-stepping simulated instance on sled"; + "instance_id" => %instance_id, + "sled_id" => %sled_id, + ); + let nexus = &cptestctx.server.server_context().nexus; + let sa = nexus + .sled_client(sled_id) + .await + .expect("sled must exist to simulate a state change"); + + sa.instance_single_step(instance_id.into_untyped_uuid()).await; +} + pub(crate) async fn instance_simulate_by_name( cptestctx: &ControlPlaneTestContext, name: &str, From 1aefe47f546d53d9d567be575602833c5fb5552b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 13:52:20 -0700 Subject: [PATCH 158/234] avoid spurious update sagas --- nexus/db-queries/src/db/datastore/instance.rs | 26 +------ nexus/db-queries/src/db/datastore/vmm.rs | 29 +++---- nexus/src/app/instance.rs | 77 ++++++++++++++----- nexus/src/app/sagas/instance_update/mod.rs | 69 +---------------- 4 files changed, 75 insertions(+), 126 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 74de734951c..18c3b129d57 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -230,15 +230,8 @@ pub struct InstanceUpdateResult { pub instance_updated: bool, /// `true` if the VMM record was updated, `false` otherwise. pub vmm_updated: bool, - /// Indicates whether a migration record for this instance was updated, if - /// [`Migrations`] were provided to - /// [`DataStore::instance_and_vmm_update_runtime`]. 
- /// - /// - `Some(true)` if a migration record was updated - /// - `Some(false)` if [`Migrations`] were provided, but the - /// migration record was not updated - /// - `None` if no [`Migrations`] were provided - pub migration_updated: Option, + pub migration_in_updated: bool, + pub migration_out_updated: bool, } impl DataStore { @@ -781,22 +774,11 @@ impl DataStore { Some(UpdateStatus::NotUpdatedButExists) => false, None => false, }; - - let migration_updated = if migrations.migration_in.is_some() - || migrations.migration_out.is_some() - { - Some( - result.migration_in_status.was_updated() - || result.migration_out_status.was_updated(), - ) - } else { - None - }; - Ok(InstanceUpdateResult { instance_updated, vmm_updated, - migration_updated, + migration_in_updated: result.migration_in_status.was_updated(), + migration_out_updated: result.migration_out_status.was_updated(), }) } diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 0ffd4b1f88d..308cfc67db2 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -7,6 +7,7 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; +use crate::db::datastore::instance::InstanceUpdateResult; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::Vmm; @@ -147,7 +148,7 @@ impl DataStore { vmm_id: PropolisUuid, new_runtime: &VmmRuntimeState, migrations: Migrations<'_>, - ) -> Result<(bool, Option), Error> { + ) -> Result { let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( vmm_id, new_runtime.clone(), @@ -163,22 +164,16 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - let vmm_updated = match result.vmm_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }; - let migration_updated = if migrations.migration_in.is_some() - || migrations.migration_out.is_some() - { - Some( - result.migration_in_status.was_updated() - || result.migration_out_status.was_updated(), - ) - } else { - None - }; - Ok((vmm_updated, migration_updated)) + Ok(InstanceUpdateResult { + instance_updated: false, + vmm_updated: match result.vmm_status { + Some(UpdateStatus::Updated) => true, + Some(UpdateStatus::NotUpdatedButExists) => false, + None => false, + }, + migration_in_updated: result.migration_in_status.was_updated(), + migration_out_updated: result.migration_out_status.was_updated(), + }) } /// Forcibly overwrites the Propolis IP/Port in the supplied VMM's record with diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 9750b2dd4e6..fc5f362a94d 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1201,19 +1201,15 @@ impl super::Nexus { "instance_id" => %instance_id, "propolis_id" => %state.propolis_id, "result" => ?update_result); - let (vmm_updated, migration_updated) = update_result?; - Ok(InstanceUpdateResult { - instance_updated: false, - vmm_updated, - migration_updated, - }) + update_result } else { // There was no instance state to write back, so --- perhaps // obviously --- nothing happened. 
Ok(InstanceUpdateResult { instance_updated: false, vmm_updated: false, - migration_updated: None, + migration_in_updated: false, + migration_out_updated: false, }) } } @@ -1906,7 +1902,7 @@ pub(crate) async fn notify_instance_updated( "migration_state" => ?migrations, ); - let (vmm_updated, migration_updated) = datastore + let result = datastore .vmm_and_migration_update_runtime( propolis_id, // TODO(eliza): probably should take this by value... @@ -1915,19 +1911,60 @@ pub(crate) async fn notify_instance_updated( ) .await?; - // If the instance or VMM records in the database have changed as a result - // of this update, prepare an `instance-update` saga to ensure that the - // changes are reflected by the instance record. + // Determine whether an `instance-update` saga should be executed. + // + // We determine this only after actually updating the database records, + // because we don't know whether a particular VMM or migration state is + // *new* or not until we know whether the corresponding database record has + // actually changed (determined by the generation number). For example, when + // an instance has migrated into a Propolis process, Propolis will continue + // to report the migration in in the `Completed` state as part of all state + // updates regarding that instance, but we no longer need to act on it if + // the migration record has already been updated to reflect that the + // migration has completed. + // + // Once we know what rows have been updated, we can inspect the states + // written to the DB to determine whether an instance-update saga is + // required to bring the instance record's state in line with the new + // VMM/migration states. // - // TODO(eliza): it would be nice to be smarter about determining whether we - // need to run a saga here. We don't need to run an instance-update saga - // *every* time a VMM or migration has been updated. instead, we should only - // trigger them if any side of the migration has *terminated*, or if the - // active VMM state transitioned to Destroyed. Eliding unnecessary start - // sagas would reduce updater lock contention and allow the necessary sagas - // to run in a timelier manner. - let updated = vmm_updated || migration_updated.unwrap_or(false); - if updated { + // Currently, an instance-update saga is required if (and only if): + // + // - The instance's active VMM has transitioned to `Destroyed`. We don't + // actually know whether the VMM whose state was updated here was the + // active VMM or not, so we will always attempt to run an instance-update + // saga if the VMM was `Destroyed`. + let vmm_needs_update = result.vmm_updated + && new_runtime_state.vmm_state.state == nexus::VmmState::Destroyed; + // - A migration in to this VMM has transitioned to a terminal state + // (`Failed` or `Completed`). + let migration_in_needs_update = result.migration_in_updated + && migrations + .migration_in + .map(|migration| migration.state.is_terminal()) + .unwrap_or(false); + // - A migration out from this VMM has transitioned to a terminal state + // (`Failed` or `Completed`). + let migration_out_needs_update = result.migration_out_updated + && migrations + .migration_out + .map(|migration| migration.state.is_terminal()) + .unwrap_or(false); + // If any of the above conditions are true, prepare an instance-update saga + // for this instance. 
+ if vmm_needs_update + || migration_in_needs_update + || migration_out_needs_update + { + debug!(opctx.log, + "new VMM runtime state from sled agent requires an instance-update saga"; + "instance_id" => %instance_id, + "propolis_id" => %propolis_id, + "vmm_needs_update" => vmm_needs_update, + "migration_in_needs_update" => migration_in_needs_update, + "migration_out_needs_update" => migration_out_needs_update, + ); + let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 8f9828c86db..9cdd041d0c2 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -830,63 +830,6 @@ mod test { .await } - /// Wait for an update saga to complete for the provided `instance`. - async fn wait_for_update( - cptestctx: &ControlPlaneTestContext, - instance: &Instance, - ) -> InstanceAndActiveVmm { - // I'd be pretty surprised if an update saga takes longer than a minute - // to complete in a unit test. If a saga hasn't run, failing the test in - // a timely manner is helpful, so making this *too* long could be annoying... - const MAX_WAIT: Duration = Duration::from_secs(60); - - let instance_id = InstanceUuid::from_untyped_uuid(instance.id()); - let initial_gen = instance.updater_gen; - let log = &cptestctx.logctx.log; - - info!( - log, - "waiting for instance update to complete..."; - "instance_id" => %instance_id, - "initial_gen" => ?initial_gen, - ); - - poll::wait_for_condition( - || async { - let state = - test_helpers::instance_fetch(cptestctx, instance_id).await; - let instance = state.instance(); - if instance.updater_gen > initial_gen - && instance.updater_id.is_none() - { - info!( - log, - "instance update completed!"; - "instance_id" => %instance_id, - "initial_gen" => ?initial_gen, - "current_gen" => ?instance.updater_gen, - ); - return Ok(state); - } - - info!( - log, - "instance update has not yet completed..."; - "instance_id" => %instance_id, - "initial_gen" => ?initial_gen, - "current_gen" => ?instance.updater_gen, - "current_updater" => ?instance.updater_id, - ); - Err(poll::CondCheckError::NotYet::<()>) - }, - // A lot can happen in one second... - &Duration::from_secs(1), - &MAX_WAIT, - ) - .await - .unwrap() - } - #[track_caller] fn assert_instance_unlocked(instance: &Instance) { assert_eq!( @@ -1025,11 +968,9 @@ mod test { let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); // Poke the instance to get it into the Running state. - let state = test_helpers::instance_fetch(cptestctx, instance_id).await; test_helpers::instance_simulate(cptestctx, &instance_id).await; - // Wait for the instance update saga triggered by a transition to - // Running to complete. - let state = wait_for_update(cptestctx, state.instance()).await; + + let state = test_helpers::instance_fetch(cptestctx, instance_id).await; // The instance should have an active VMM. let instance_runtime = state.instance().runtime(); assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); @@ -1373,9 +1314,6 @@ mod test { let state = test_helpers::instance_fetch(cptestctx, instance_id).await; test_helpers::instance_simulate(cptestctx, &instance_id).await; - // Wait for the instance update saga triggered by a transition to - // Running to complete. 
- let state = wait_for_update(cptestctx, state.instance()).await; let vmm = state.vmm().as_ref().unwrap(); let dst_sled_id = @@ -1402,9 +1340,6 @@ mod test { &dst_sled_id, ) .await; - // Wait for the instance update saga triggered by poking the target - // VMM to complete (it should be a NOP). - wait_for_update(cptestctx, state.instance()).await; let (_, _, authz_instance, ..) = LookupPath::new(&opctx, &datastore) From f91a2d1034b40ba741de0dec3690dd67f4eb8fab Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 14:37:23 -0700 Subject: [PATCH 159/234] move update saga needed logic into saga module I could be convinced otherwise about this change. My thought process was that it seemed nice for that logic to live next to the documentation for the update saga itself, so that a reader can see it all at the same time. But, on the other hand, this is a function that's only called in one place, which is also not my favorite thing. --- nexus/src/app/instance.rs | 65 ++++----------------- nexus/src/app/sagas/instance_update/mod.rs | 67 ++++++++++++++++++++++ 2 files changed, 77 insertions(+), 55 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index fc5f362a94d..cdd228e7b3e 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1884,7 +1884,8 @@ impl super::Nexus { } /// Invoked by a sled agent to publish an updated runtime state for an -/// Instance, returning an update saga for that instance. +/// Instance, returning an update saga for that instance (if one must be +/// executed). pub(crate) async fn notify_instance_updated( datastore: &DataStore, opctx: &OpContext, @@ -1911,60 +1912,14 @@ pub(crate) async fn notify_instance_updated( ) .await?; - // Determine whether an `instance-update` saga should be executed. - // - // We determine this only after actually updating the database records, - // because we don't know whether a particular VMM or migration state is - // *new* or not until we know whether the corresponding database record has - // actually changed (determined by the generation number). For example, when - // an instance has migrated into a Propolis process, Propolis will continue - // to report the migration in in the `Completed` state as part of all state - // updates regarding that instance, but we no longer need to act on it if - // the migration record has already been updated to reflect that the - // migration has completed. - // - // Once we know what rows have been updated, we can inspect the states - // written to the DB to determine whether an instance-update saga is - // required to bring the instance record's state in line with the new - // VMM/migration states. - // - // Currently, an instance-update saga is required if (and only if): - // - // - The instance's active VMM has transitioned to `Destroyed`. We don't - // actually know whether the VMM whose state was updated here was the - // active VMM or not, so we will always attempt to run an instance-update - // saga if the VMM was `Destroyed`. - let vmm_needs_update = result.vmm_updated - && new_runtime_state.vmm_state.state == nexus::VmmState::Destroyed; - // - A migration in to this VMM has transitioned to a terminal state - // (`Failed` or `Completed`). - let migration_in_needs_update = result.migration_in_updated - && migrations - .migration_in - .map(|migration| migration.state.is_terminal()) - .unwrap_or(false); - // - A migration out from this VMM has transitioned to a terminal state - // (`Failed` or `Completed`). 
- let migration_out_needs_update = result.migration_out_updated - && migrations - .migration_out - .map(|migration| migration.state.is_terminal()) - .unwrap_or(false); - // If any of the above conditions are true, prepare an instance-update saga - // for this instance. - if vmm_needs_update - || migration_in_needs_update - || migration_out_needs_update - { - debug!(opctx.log, - "new VMM runtime state from sled agent requires an instance-update saga"; - "instance_id" => %instance_id, - "propolis_id" => %propolis_id, - "vmm_needs_update" => vmm_needs_update, - "migration_in_needs_update" => migration_in_needs_update, - "migration_out_needs_update" => migration_out_needs_update, - ); - + // If an instance-update saga must be executed as a result of this update, + // prepare and return it. + if instance_update::update_saga_needed( + &opctx.log, + instance_id, + new_runtime_state, + &result, + ) { let (.., authz_instance) = LookupPath::new(&opctx, datastore) .instance_id(instance_id.into_untyped_uuid()) .lookup_for(authz::Action::Modify) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 9cdd041d0c2..b87e82d6c29 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -24,6 +24,7 @@ use chrono::Utc; use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; use omicron_common::api::external::Error; +use omicron_common::api::internal::nexus; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -48,6 +49,72 @@ pub(crate) use self::start::{Params, SagaInstanceUpdate}; mod destroyed; +/// Returns `true` if an `instance-update` saga should be executed as a result +/// of writing the provided [`SledInstanceState`] to the database with the +/// provided [`InstanceUpdateResult`]. +/// +/// We determine this only after actually updating the database records, +/// because we don't know whether a particular VMM or migration state is +/// *new* or not until we know whether the corresponding database record has +/// actually changed (determined by the generation number). For example, when +/// an instance has migrated into a Propolis process, Propolis will continue +/// to report the migration in in the `Completed` state as part of all state +/// updates regarding that instance, but we no longer need to act on it if +/// the migration record has already been updated to reflect that the +/// migration has completed. +/// +/// Once we know what rows have been updated, we can inspect the states +/// written to the DB to determine whether an instance-update saga is +/// required to bring the instance record's state in line with the new +/// VMM/migration states. +pub fn update_saga_needed( + log: &slog::Logger, + instance_id: InstanceUuid, + state: &nexus::SledInstanceState, + result: &instance::InstanceUpdateResult, +) -> bool { + // Currently, an instance-update saga is required if (and only if): + // + // - The instance's active VMM has transitioned to `Destroyed`. We don't + // actually know whether the VMM whose state was updated here was the + // active VMM or not, so we will always attempt to run an instance-update + // saga if the VMM was `Destroyed`. + let vmm_needs_update = result.vmm_updated + && state.vmm_state.state == nexus::VmmState::Destroyed; + // - A migration in to this VMM has transitioned to a terminal state + // (`Failed` or `Completed`). 
+ let migrations = state.migrations(); + let migration_in_needs_update = result.migration_in_updated + && migrations + .migration_in + .map(|migration| migration.state.is_terminal()) + .unwrap_or(false); + // - A migration out from this VMM has transitioned to a terminal state + // (`Failed` or `Completed`). + let migration_out_needs_update = result.migration_out_updated + && migrations + .migration_out + .map(|migration| migration.state.is_terminal()) + .unwrap_or(false); + // If any of the above conditions are true, prepare an instance-update saga + // for this instance. + let needed = vmm_needs_update + || migration_in_needs_update + || migration_out_needs_update; + if needed { + debug!(log, + "new VMM runtime state from sled agent requires an \ + instance-update saga"; + "instance_id" => %instance_id, + "propolis_id" => %state.propolis_id, + "vmm_needs_update" => vmm_needs_update, + "migration_in_needs_update" => migration_in_needs_update, + "migration_out_needs_update" => migration_out_needs_update, + ); + } + needed +} + #[derive(Debug, Deserialize, Serialize)] struct UpdatesRequired { /// The new runtime state that must be written back to the database. From e74fe4cc306c7e1b8a0aea850028d041859427b9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 14:46:30 -0700 Subject: [PATCH 160/234] get rid of `write_returned_instance_state` This function is now basically just a version of `notify_instance_updated` that doesn't run an update saga. We should run update sagas for states returned by sled-agents (if needed), so `write_returned_instance_state` should no longer exist. Use `notify_instance_updated` instead. --- nexus/src/app/instance.rs | 66 +++++---------------------------------- 1 file changed, 8 insertions(+), 58 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index cdd228e7b3e..55147ae835c 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -852,13 +852,13 @@ impl super::Nexus { // the caller to let it decide how to handle it. // // When creating the zone for the first time, we just get - // Ok(None) here, which is a no-op in write_returned_instance_state. + // Ok(None) here, in which case, there's nothing to write back. match instance_put_result { - Ok(state) => self - .write_returned_instance_state(&instance_id, state) + Ok(Some(ref state)) => self + .notify_instance_updated(opctx, instance_id, state) .await - .map(|_| ()) .map_err(Into::into), + Ok(None) => Ok(()), Err(e) => Err(InstanceStateChangeError::SledAgent(e)), } } @@ -1139,12 +1139,13 @@ impl super::Nexus { }, ) .await - .map(|res| Some(res.into_inner().into())) + .map(|res| res.into_inner().into()) .map_err(|e| SledAgentInstancePutError(e)); match instance_register_result { Ok(state) => { - self.write_returned_instance_state(&instance_id, state).await?; + self.notify_instance_updated(opctx, instance_id, &state) + .await?; } Err(e) => { if e.instance_unhealthy() { @@ -1163,57 +1164,6 @@ impl super::Nexus { Ok(()) } - /// Takes an updated instance state returned from a call to sled agent and - /// writes it back to the database. - /// - /// # Return value - /// - /// - `Ok((instance_updated, vmm_updated))` if no failures occurred. The - /// tuple fields indicate which database records (if any) were updated. - /// Note that it is possible for sled agent not to return an updated - /// instance state from a particular API call. In that case, the `state` - /// parameter is `None` and this routine returns `Ok((false, false))`. 
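As a rough, self-contained illustration of the `update_saga_needed` predicate introduced above (using hypothetical flattened types rather than the real `SledInstanceState` and `InstanceUpdateResult` models), the three triggering conditions look something like this:

```rust
#[derive(PartialEq, Clone, Copy)]
enum VmmState { Running, Destroyed }

#[derive(PartialEq, Clone, Copy)]
enum MigrationState { InProgress, Completed, Failed }

impl MigrationState {
    fn is_terminal(self) -> bool {
        matches!(self, MigrationState::Completed | MigrationState::Failed)
    }
}

/// Which database rows were actually changed by the state write.
struct WriteResult {
    vmm_updated: bool,
    migration_in_updated: bool,
    migration_out_updated: bool,
}

fn update_saga_needed(
    vmm_state: VmmState,
    migration_in: Option<MigrationState>,
    migration_out: Option<MigrationState>,
    result: &WriteResult,
) -> bool {
    // 1. The reported VMM was destroyed (and its row actually changed).
    let vmm = result.vmm_updated && vmm_state == VmmState::Destroyed;
    // 2. A migration *in* to this VMM reached a terminal state.
    let migration_in = result.migration_in_updated
        && migration_in.map(MigrationState::is_terminal).unwrap_or(false);
    // 3. A migration *out* of this VMM reached a terminal state.
    let migration_out = result.migration_out_updated
        && migration_out.map(MigrationState::is_terminal).unwrap_or(false);
    vmm || migration_in || migration_out
}
```

Because each condition is gated on the row having actually changed, repeated reports of an already-recorded terminal state do not start spurious sagas.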
- /// - `Err` if an error occurred while writing state to the database. A - /// database operation that succeeds but doesn't update anything (e.g. - /// owing to an outdated generation number) will return `Ok`. - async fn write_returned_instance_state( - &self, - instance_id: &InstanceUuid, - state: Option, - ) -> Result { - slog::debug!(&self.log, - "writing instance state returned from sled agent"; - "instance_id" => %instance_id, - "new_state" => ?state); - - if let Some(state) = state { - let update_result = self - .db_datastore - .vmm_and_migration_update_runtime( - state.propolis_id, - &state.vmm_state.clone().into(), - state.migrations(), - ) - .await; - - slog::debug!(&self.log, - "attempted to write instance state from sled agent"; - "instance_id" => %instance_id, - "propolis_id" => %state.propolis_id, - "result" => ?update_result); - update_result - } else { - // There was no instance state to write back, so --- perhaps - // obviously --- nothing happened. - Ok(InstanceUpdateResult { - instance_updated: false, - vmm_updated: false, - migration_in_updated: false, - migration_out_updated: false, - }) - } - } - /// Attempts to move an instance from `prev_instance_runtime` to the /// `Failed` state in response to an error returned from a call to a sled /// agent instance API, supplied in `reason`. @@ -1375,7 +1325,7 @@ impl super::Nexus { /// Invoked by a sled agent to publish an updated runtime state for an /// Instance. pub(crate) async fn notify_instance_updated( - self: &Arc, + &self, opctx: &OpContext, instance_id: InstanceUuid, new_runtime_state: &nexus::SledInstanceState, From 20311aa9e006b2ad3d1537e003787cb956e93a43 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 16:11:01 -0700 Subject: [PATCH 161/234] cleanup warnings --- nexus/src/app/instance.rs | 1 - nexus/src/app/sagas/instance_update/mod.rs | 2 -- 2 files changed, 3 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 55147ae835c..089f8f9e1e8 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -28,7 +28,6 @@ use nexus_db_queries::authn; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; -use nexus_db_queries::db::datastore::instance::InstanceUpdateResult; use nexus_db_queries::db::datastore::InstanceAndActiveVmm; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup; diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index b87e82d6c29..06f0ec053d3 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -828,11 +828,9 @@ mod test { use omicron_common::api::internal::nexus::{ MigrationRuntimeState, MigrationState, Migrations, }; - use omicron_test_utils::dev::poll; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; - use std::time::Duration; use uuid::Uuid; type ControlPlaneTestContext = From 77dc84bf6cdee5767312b1baa4c96914cd93b5c3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 16:26:34 -0700 Subject: [PATCH 162/234] docs unhappiness --- nexus/src/app/sagas/instance_update/mod.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 06f0ec053d3..56154722794 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -10,6 +10,7 @@ use 
super::{ ACTION_GENERATE_ID, }; use crate::app::db::datastore::instance; +use crate::app::db::datastore::instance::InstanceUpdateResult; use crate::app::db::datastore::InstanceSnapshot; use crate::app::db::lookup::LookupPath; use crate::app::db::model::Generation; @@ -50,8 +51,8 @@ pub(crate) use self::start::{Params, SagaInstanceUpdate}; mod destroyed; /// Returns `true` if an `instance-update` saga should be executed as a result -/// of writing the provided [`SledInstanceState`] to the database with the -/// provided [`InstanceUpdateResult`]. +/// of writing the provided [`nexus::SledInstanceState`] to the database with +/// the provided [`InstanceUpdateResult`]. /// /// We determine this only after actually updating the database records, /// because we don't know whether a particular VMM or migration state is @@ -71,7 +72,7 @@ pub fn update_saga_needed( log: &slog::Logger, instance_id: InstanceUuid, state: &nexus::SledInstanceState, - result: &instance::InstanceUpdateResult, + result: &InstanceUpdateResult, ) -> bool { // Currently, an instance-update saga is required if (and only if): // From bc4f27640fe0863ccd952bfaa814b74180ee5614 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 18 Jul 2024 18:51:36 -0700 Subject: [PATCH 163/234] start on the documentation i promised i'd write --- nexus/src/app/sagas/instance_update/mod.rs | 90 +++++++++++++++++++++- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 56154722794..2aea5048abf 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -4,7 +4,88 @@ //! Instance Update Saga //! -//! # Theory of Operation +//! ## Background +//! +//! The state of a VM instance, as understood by Nexus, consists of a +//! combination of database tables: +//! +//! - The `instance` table, owned exclusively by Nexus itself, represents the +//! user-facing "logical" VM instance. +//! - The `vmm` table, which represents a "physical" Propolis VMM process on +//! which a running instance is incarnated. +//! - The `migration` table, which represents the state of an in-progress live +//! migration of an instance between two VMMs. +//! +//! When an instance is incarnated on a sled, the `propolis_id` field in an +//! `instance` record contains a UUID foreign key into the `vmm` table that points +//! to the `vmm` record for the Propolis process on which the instance is +//! currently running. If an instance is undergoing live migration, its record +//! additionally contains a `dst_propolis_id` foreign key pointing at the `vmm` +//! row representing the *target* Propolis process that it is migrating to, and +//! a `migration_id` foreign key into the `migration` table record tracking the +//! state of that migration. +//! +//! Sled-agents report the state of the VMMs they manage to Nexus. This occurs +//! when a VMM state transition occurs and the sled-agent *pushes* an update to +//! Nexus' `cpapi_instances_put` internal API endpoint, when a Nexus' +//! `instance-watcher` background task *pulls* instance states from sled-agents +//! periodically, or as the return value of an API call from Nexus to a +//! sled-agent. When a Nexus receives a new [`SledInstanceState`] from a +//! sled-agent through any of these mechanisms, the Nexus will write any changed +//! state to the `vmm` and/or `migration` tables directly on behalf of the +//! sled-agent. +//! +//! 
Although Nexus is technically the party responsible for the database query +//! that writes VMM and migration state updates received from sled-agent, it is +//! the sled-agent that *logically* "owns" these records. A row in the `vmm` +//! table represents a particular Propolis process running on a particular sled, +//! and that sled's sled-agent is the sole source of truth for that process. The +//! generation number for a `vmm` record is the property of the sled-agent +//! responsible for that VMM. Similarly, a `migration` record has separate +//! generation numbers for the source and target VMM states, which are owned by +//! the sled-agents responsible for the source and target Propolis processes, +//! respectively. If a sled-agent pushes a state update to a particular Nexus +//! instance and that Nexus fails to write the state to the database, that isn't +//! the end of the world: the sled-agent can simply retry with a different +//! Nexus, and the generation number, which is incremented exclusively by the +//! sled-agent, ensures that state changes are idempotent and ordered. If a +//! faulty Nexus were to return 200 OK to a sled-agent's call to +//! `cpapi_instances_put` but choose to simply eat the received instance state +//! update rather than writing it to the database, even *that* wouldn't +//! necessarily mean that the state change was gone forever: the +//! `instance-watcher` background task on another Nexus instance would +//! eventually poll the sled-agent's state and observe any changes that were +//! accidentally missed. This is all very neat and tidy, and we should feel +//! proud of ourselves for having designed such a nice little system. +//! +//! Unfortunately, when we look beyond the `vmm` and `migration` tables, things +//! rapidly become interesting (in the "may you live in interesting times" +//! sense). The `instance` record *cannot* be owned exclusively by anyone. The +//! logical instance state it represents is a gestalt that may consist of state +//! that exists in multiple VMM processes on multiple sleds, as well as +//! control-plane operations triggered by operator inputs and performed by +//! multiple Nexus instances. This is, as they say, "hairy". The neat and tidy +//! little state updates published by sled-agents to Nexus in the previous +//! paragraph may, in some cases, represent a state transition that also +//! requires changes to the `instance` table: for instance, a live migration may +//! have completed, necessitating a change in the instance's `propolis_id` to +//! point to the new VMM. +//! +//! Oh, and one other thing: the `instance` table record in turn logically +//! "owns" other resources, such as the virtual-provisioning counters that +//! represent rack-level resources allocated to the instance, and the instance's +//! network configuration. When the instance's state changes, these resources +//! owned by the instance may also need to be updated, such as changing the +//! network configuration to point at an instance's new home after a successful +//! migration, or deallocating virtual provisioning counters when an instance is +//! destroyed. Naturally, these updates must also be performed reliably and +//! inconsistent states must be avoided. +//! +//! Thus, we arrive here, at the instance-update saga. +//! +//! ## Theory of Operation +//! +//! 
Some ground rules: use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, @@ -26,6 +107,7 @@ use nexus_db_queries::{authn, authz}; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::internal::nexus; +use omicron_common::api::internal::nexus::SledInstanceState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::PropolisUuid; @@ -51,8 +133,8 @@ pub(crate) use self::start::{Params, SagaInstanceUpdate}; mod destroyed; /// Returns `true` if an `instance-update` saga should be executed as a result -/// of writing the provided [`nexus::SledInstanceState`] to the database with -/// the provided [`InstanceUpdateResult`]. +/// of writing the provided [`SledInstanceState`] to the database with the +/// provided [`InstanceUpdateResult`]. /// /// We determine this only after actually updating the database records, /// because we don't know whether a particular VMM or migration state is @@ -71,7 +153,7 @@ mod destroyed; pub fn update_saga_needed( log: &slog::Logger, instance_id: InstanceUuid, - state: &nexus::SledInstanceState, + state: &SledInstanceState, result: &InstanceUpdateResult, ) -> bool { // Currently, an instance-update saga is required if (and only if): From 7abe81443e794a9c7aaeb0e15a1f3f983da72619 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 19 Jul 2024 10:24:40 -0700 Subject: [PATCH 164/234] use the same code for determining effective states everywhere --- nexus/db-queries/src/db/datastore/instance.rs | 71 ++++++++++++------- nexus/src/app/instance.rs | 9 ++- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 18c3b129d57..c47b50a5c76 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -78,37 +78,42 @@ impl InstanceAndActiveVmm { self.vmm.as_ref().map(|v| SledUuid::from_untyped_uuid(v.sled_id)) } + /// Returns the operator-visible [external API + /// `InstanceState`](external::InstanceState) for this instance and its + /// active VMM. pub fn effective_state(&self) -> external::InstanceState { - if let Some(vmm) = &self.vmm { - vmm.runtime.state.into() - } else { - self.instance.runtime().nexus_state.into() - } - } -} - -impl From<(Instance, Option)> for InstanceAndActiveVmm { - fn from(value: (Instance, Option)) -> Self { - Self { instance: value.0, vmm: value.1 } + Self::determine_effective_state(&self.instance, self.vmm.as_ref()) } -} -impl From for external::Instance { - fn from(value: InstanceAndActiveVmm) -> Self { + /// Returns the operator-visible [external API + /// `InstanceState`](external::InstanceState) for the provided [`Instance`] + /// and its active [`Vmm`], if one exists. + /// + /// # Arguments + /// + /// - `instance`: the instance + /// - `active_vmm`: the instance's active VMM, if one exists. + /// + /// # Notes + /// + /// Generally, the value of `active_vmm` should be + /// the VMM pointed to by `instance.runtime_state.propolis_id`. However, + /// this is not enforced by this function, as the `instance_migrate` saga + /// must in some cases determine an effective instance state from the + /// instance and *target* VMM states. 
+ pub fn determine_effective_state( + instance: &Instance, + active_vmm: Option<&Vmm>, + ) -> external::InstanceState { use crate::db::model::InstanceState; use crate::db::model::VmmState; - let time_run_state_updated = value - .vmm - .as_ref() - .map(|vmm| vmm.runtime.time_state_updated) - .unwrap_or(value.instance.runtime_state.time_updated); - let instance_state = value.instance.runtime_state.nexus_state; - let vmm_state = value.vmm.as_ref().map(|vmm| vmm.runtime.state); + let instance_state = instance.runtime_state.nexus_state; + let vmm_state = active_vmm.map(|vmm| vmm.runtime.state); // We want to only report that an instance is `Stopped` when a new // `instance-start` saga is able to proceed. That means that: - let run_state = match (instance_state, vmm_state) { + match (instance_state, vmm_state) { // - If there's an active migration ID for the instance, *always* // treat its state as "migration" regardless of the VMM's state. // @@ -127,7 +132,7 @@ impl From for external::Instance { // and migration IDs. // (InstanceState::Vmm, Some(_)) - if value.instance.runtime_state.migration_id.is_some() => + if instance.runtime_state.migration_id.is_some() => { external::InstanceState::Migrating } @@ -157,7 +162,23 @@ impl From for external::Instance { } // If there's no VMM state, use the instance's state. (instance_state, None) => instance_state.into(), - }; + } + } +} + +impl From<(Instance, Option)> for InstanceAndActiveVmm { + fn from(value: (Instance, Option)) -> Self { + Self { instance: value.0, vmm: value.1 } + } +} + +impl From for external::Instance { + fn from(value: InstanceAndActiveVmm) -> Self { + let time_run_state_updated = value + .vmm + .as_ref() + .map(|vmm| vmm.runtime.time_state_updated) + .unwrap_or(value.instance.runtime_state.time_updated); Self { identity: value.instance.identity(), @@ -170,7 +191,7 @@ impl From for external::Instance { .parse() .expect("found invalid hostname in the database"), runtime: external::InstanceRuntimeState { - run_state, + run_state: value.effective_state(), time_run_state_updated, }, } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 089f8f9e1e8..00076dfa483 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -695,11 +695,10 @@ impl super::Nexus { vmm_state: &Option, requested: &InstanceStateChangeRequest, ) -> Result { - let effective_state = if let Some(vmm) = vmm_state { - vmm.runtime.state.into() - } else { - instance_state.runtime().nexus_state.into() - }; + let effective_state = InstanceAndActiveVmm::determine_effective_state( + instance_state, + vmm_state.as_ref(), + ); // Requests that operate on active instances have to be directed to the // instance's current sled agent. If there is none, the request needs to From 17bb219fcbc43b4a9162afe326f0fc2009a2a2f5 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 19 Jul 2024 11:23:27 -0700 Subject: [PATCH 165/234] Revert "use the same code for determining effective states" This reverts commit 1589237b4250aa00dc42d939b4ee48c73e5db582. I think something may have broke due to this. 
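The logic that these commits centralize, revert, and then reapply can be summarized with a deliberately simplified, hypothetical sketch; it is not the exact set of match arms used by `determine_effective_state`, but it shows how the instance record, its migration ID, and its active VMM combine into one operator-visible state:

```rust
#[derive(Clone, Copy)]
enum DbInstanceState { Vmm, NoVmm, Destroyed }

#[derive(Clone, Copy)]
enum DbVmmState { Starting, Running, Stopping, Destroyed }

#[derive(Debug, PartialEq)]
enum ApiInstanceState { Starting, Running, Stopping, Stopped, Migrating, Destroyed }

fn effective_state(
    instance_state: DbInstanceState,
    migration_id: Option<u64>,
    active_vmm: Option<DbVmmState>,
) -> ApiInstanceState {
    match (instance_state, active_vmm) {
        // An instance with an in-progress migration always reads as
        // "migrating", regardless of what either VMM currently reports.
        (DbInstanceState::Vmm, Some(_)) if migration_id.is_some() => {
            ApiInstanceState::Migrating
        }
        // A destroyed VMM that the instance record still points at reads as
        // "stopping" until an update saga unlinks it; only then can a new
        // instance-start saga proceed, so only then do we say "stopped".
        (_, Some(DbVmmState::Destroyed)) => ApiInstanceState::Stopping,
        (_, Some(DbVmmState::Starting)) => ApiInstanceState::Starting,
        (_, Some(DbVmmState::Running)) => ApiInstanceState::Running,
        (_, Some(DbVmmState::Stopping)) => ApiInstanceState::Stopping,
        // With no active VMM, fall back to the instance record itself.
        (DbInstanceState::Destroyed, None) => ApiInstanceState::Destroyed,
        (_, None) => ApiInstanceState::Stopped,
    }
}
```

Keeping this in a single helper is what lets the external API conversion and the sled-agent request routing agree on the same answer instead of maintaining two copies of the mapping.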
--- nexus/db-queries/src/db/datastore/instance.rs | 71 +++++++------------ nexus/src/app/instance.rs | 9 +-- 2 files changed, 30 insertions(+), 50 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index c47b50a5c76..18c3b129d57 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -78,42 +78,37 @@ impl InstanceAndActiveVmm { self.vmm.as_ref().map(|v| SledUuid::from_untyped_uuid(v.sled_id)) } - /// Returns the operator-visible [external API - /// `InstanceState`](external::InstanceState) for this instance and its - /// active VMM. pub fn effective_state(&self) -> external::InstanceState { - Self::determine_effective_state(&self.instance, self.vmm.as_ref()) + if let Some(vmm) = &self.vmm { + vmm.runtime.state.into() + } else { + self.instance.runtime().nexus_state.into() + } } +} - /// Returns the operator-visible [external API - /// `InstanceState`](external::InstanceState) for the provided [`Instance`] - /// and its active [`Vmm`], if one exists. - /// - /// # Arguments - /// - /// - `instance`: the instance - /// - `active_vmm`: the instance's active VMM, if one exists. - /// - /// # Notes - /// - /// Generally, the value of `active_vmm` should be - /// the VMM pointed to by `instance.runtime_state.propolis_id`. However, - /// this is not enforced by this function, as the `instance_migrate` saga - /// must in some cases determine an effective instance state from the - /// instance and *target* VMM states. - pub fn determine_effective_state( - instance: &Instance, - active_vmm: Option<&Vmm>, - ) -> external::InstanceState { +impl From<(Instance, Option)> for InstanceAndActiveVmm { + fn from(value: (Instance, Option)) -> Self { + Self { instance: value.0, vmm: value.1 } + } +} + +impl From for external::Instance { + fn from(value: InstanceAndActiveVmm) -> Self { use crate::db::model::InstanceState; use crate::db::model::VmmState; + let time_run_state_updated = value + .vmm + .as_ref() + .map(|vmm| vmm.runtime.time_state_updated) + .unwrap_or(value.instance.runtime_state.time_updated); - let instance_state = instance.runtime_state.nexus_state; - let vmm_state = active_vmm.map(|vmm| vmm.runtime.state); + let instance_state = value.instance.runtime_state.nexus_state; + let vmm_state = value.vmm.as_ref().map(|vmm| vmm.runtime.state); // We want to only report that an instance is `Stopped` when a new // `instance-start` saga is able to proceed. That means that: - match (instance_state, vmm_state) { + let run_state = match (instance_state, vmm_state) { // - If there's an active migration ID for the instance, *always* // treat its state as "migration" regardless of the VMM's state. // @@ -132,7 +127,7 @@ impl InstanceAndActiveVmm { // and migration IDs. // (InstanceState::Vmm, Some(_)) - if instance.runtime_state.migration_id.is_some() => + if value.instance.runtime_state.migration_id.is_some() => { external::InstanceState::Migrating } @@ -162,23 +157,7 @@ impl InstanceAndActiveVmm { } // If there's no VMM state, use the instance's state. 
(instance_state, None) => instance_state.into(), - } - } -} - -impl From<(Instance, Option)> for InstanceAndActiveVmm { - fn from(value: (Instance, Option)) -> Self { - Self { instance: value.0, vmm: value.1 } - } -} - -impl From for external::Instance { - fn from(value: InstanceAndActiveVmm) -> Self { - let time_run_state_updated = value - .vmm - .as_ref() - .map(|vmm| vmm.runtime.time_state_updated) - .unwrap_or(value.instance.runtime_state.time_updated); + }; Self { identity: value.instance.identity(), @@ -191,7 +170,7 @@ impl From for external::Instance { .parse() .expect("found invalid hostname in the database"), runtime: external::InstanceRuntimeState { - run_state: value.effective_state(), + run_state, time_run_state_updated, }, } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 00076dfa483..089f8f9e1e8 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -695,10 +695,11 @@ impl super::Nexus { vmm_state: &Option, requested: &InstanceStateChangeRequest, ) -> Result { - let effective_state = InstanceAndActiveVmm::determine_effective_state( - instance_state, - vmm_state.as_ref(), - ); + let effective_state = if let Some(vmm) = vmm_state { + vmm.runtime.state.into() + } else { + instance_state.runtime().nexus_state.into() + }; // Requests that operate on active instances have to be directed to the // instance's current sled agent. If there is none, the request needs to From 5d699c9360aad4ee6bb3ddb16863b3c209037f46 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 19 Jul 2024 13:18:44 -0700 Subject: [PATCH 166/234] Reapply "use the same code for determining effective states" This reverts commit 1b6bc5d78176114d8fe0e0e9bbb296e19367e30b. --- nexus/db-queries/src/db/datastore/instance.rs | 71 ++++++++++++------- nexus/src/app/instance.rs | 9 ++- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 18c3b129d57..c47b50a5c76 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -78,37 +78,42 @@ impl InstanceAndActiveVmm { self.vmm.as_ref().map(|v| SledUuid::from_untyped_uuid(v.sled_id)) } + /// Returns the operator-visible [external API + /// `InstanceState`](external::InstanceState) for this instance and its + /// active VMM. pub fn effective_state(&self) -> external::InstanceState { - if let Some(vmm) = &self.vmm { - vmm.runtime.state.into() - } else { - self.instance.runtime().nexus_state.into() - } - } -} - -impl From<(Instance, Option)> for InstanceAndActiveVmm { - fn from(value: (Instance, Option)) -> Self { - Self { instance: value.0, vmm: value.1 } + Self::determine_effective_state(&self.instance, self.vmm.as_ref()) } -} -impl From for external::Instance { - fn from(value: InstanceAndActiveVmm) -> Self { + /// Returns the operator-visible [external API + /// `InstanceState`](external::InstanceState) for the provided [`Instance`] + /// and its active [`Vmm`], if one exists. + /// + /// # Arguments + /// + /// - `instance`: the instance + /// - `active_vmm`: the instance's active VMM, if one exists. + /// + /// # Notes + /// + /// Generally, the value of `active_vmm` should be + /// the VMM pointed to by `instance.runtime_state.propolis_id`. However, + /// this is not enforced by this function, as the `instance_migrate` saga + /// must in some cases determine an effective instance state from the + /// instance and *target* VMM states. 
+ pub fn determine_effective_state( + instance: &Instance, + active_vmm: Option<&Vmm>, + ) -> external::InstanceState { use crate::db::model::InstanceState; use crate::db::model::VmmState; - let time_run_state_updated = value - .vmm - .as_ref() - .map(|vmm| vmm.runtime.time_state_updated) - .unwrap_or(value.instance.runtime_state.time_updated); - let instance_state = value.instance.runtime_state.nexus_state; - let vmm_state = value.vmm.as_ref().map(|vmm| vmm.runtime.state); + let instance_state = instance.runtime_state.nexus_state; + let vmm_state = active_vmm.map(|vmm| vmm.runtime.state); // We want to only report that an instance is `Stopped` when a new // `instance-start` saga is able to proceed. That means that: - let run_state = match (instance_state, vmm_state) { + match (instance_state, vmm_state) { // - If there's an active migration ID for the instance, *always* // treat its state as "migration" regardless of the VMM's state. // @@ -127,7 +132,7 @@ impl From for external::Instance { // and migration IDs. // (InstanceState::Vmm, Some(_)) - if value.instance.runtime_state.migration_id.is_some() => + if instance.runtime_state.migration_id.is_some() => { external::InstanceState::Migrating } @@ -157,7 +162,23 @@ impl From for external::Instance { } // If there's no VMM state, use the instance's state. (instance_state, None) => instance_state.into(), - }; + } + } +} + +impl From<(Instance, Option)> for InstanceAndActiveVmm { + fn from(value: (Instance, Option)) -> Self { + Self { instance: value.0, vmm: value.1 } + } +} + +impl From for external::Instance { + fn from(value: InstanceAndActiveVmm) -> Self { + let time_run_state_updated = value + .vmm + .as_ref() + .map(|vmm| vmm.runtime.time_state_updated) + .unwrap_or(value.instance.runtime_state.time_updated); Self { identity: value.instance.identity(), @@ -170,7 +191,7 @@ impl From for external::Instance { .parse() .expect("found invalid hostname in the database"), runtime: external::InstanceRuntimeState { - run_state, + run_state: value.effective_state(), time_run_state_updated, }, } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 089f8f9e1e8..00076dfa483 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -695,11 +695,10 @@ impl super::Nexus { vmm_state: &Option, requested: &InstanceStateChangeRequest, ) -> Result { - let effective_state = if let Some(vmm) = vmm_state { - vmm.runtime.state.into() - } else { - instance_state.runtime().nexus_state.into() - }; + let effective_state = InstanceAndActiveVmm::determine_effective_state( + instance_state, + vmm_state.as_ref(), + ); // Requests that operate on active instances have to be directed to the // instance's current sled agent. 
If there is none, the request needs to From 0146cc737c349015a8973e046c0c5234abf94c9b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 09:26:19 -0700 Subject: [PATCH 167/234] cleanup visibilities --- nexus/src/app/sagas/instance_update/destroyed.rs | 2 +- nexus/src/app/sagas/instance_update/mod.rs | 4 ++-- nexus/src/app/sagas/instance_update/start.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index a43abad8eba..e5bdab49263 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -102,7 +102,7 @@ async fn siu_destroyed_release_sled_resources( .map_err(ActionError::action_failed) } -pub(super) async fn siu_destroyed_mark_vmm_deleted( +async fn siu_destroyed_mark_vmm_deleted( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 2aea5048abf..7493841091f 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -671,7 +671,7 @@ async fn siu_update_network_config( Ok(()) } -pub(super) async fn siu_release_virtual_provisioning( +async fn siu_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); @@ -731,7 +731,7 @@ pub(super) async fn siu_release_virtual_provisioning( Ok(()) } -pub(super) async fn siu_unassign_oximeter_producer( +async fn siu_unassign_oximeter_producer( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index b9377822a12..27620d01aaf 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -25,9 +25,9 @@ use uuid::Uuid; pub(crate) struct Params { /// Authentication context to use to fetch the instance's current state from /// the database. 
- pub serialized_authn: authn::saga::Serialized, + pub(crate) serialized_authn: authn::saga::Serialized, - pub authz_instance: authz::Instance, + pub(crate) authz_instance: authz::Instance, } // instance update saga: actions From 1805b8dfb9381ce46f1c9b5c7e87469679de2ab1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 10:40:30 -0700 Subject: [PATCH 168/234] ensure lock is reliably unlocked when unwinding --- nexus/db-queries/src/db/datastore/instance.rs | 17 +- nexus/src/app/sagas/instance_update/mod.rs | 188 +++++++++++++++--- nexus/src/app/sagas/instance_update/start.rs | 40 ++-- 3 files changed, 190 insertions(+), 55 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index c47b50a5c76..e65a13b22cb 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1178,12 +1178,13 @@ impl DataStore { &self, opctx: &OpContext, authz_instance: &authz::Instance, - UpdaterLock { updater_id, locked_gen }: UpdaterLock, + lock: &UpdaterLock, new_runtime: Option<&InstanceRuntimeState>, ) -> Result { use db::schema::instance::dsl; let instance_id = authz_instance.id(); + let UpdaterLock { updater_id, locked_gen } = *lock; let result = diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) @@ -1381,7 +1382,7 @@ mod tests { // unlock the instance from saga 1 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock1, None) .await .expect("instance must be unlocked by saga 1"); assert!(unlocked, "instance must actually be unlocked"); @@ -1394,7 +1395,7 @@ mod tests { // unlock the instance from saga 2 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, lock2, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock2, None) .await .expect("instance must be unlocked by saga 2"); assert!(unlocked, "instance must actually be unlocked"); @@ -1440,7 +1441,7 @@ mod tests { // now, unlock the instance. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock1, None) .await ) .expect("instance should unlock"); @@ -1449,7 +1450,7 @@ mod tests { // unlocking it again should also succeed... let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock2, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock2, None) .await ) .expect("instance should unlock again"); @@ -1492,7 +1493,7 @@ mod tests { // what we're doing here. But this simulates a case where // an incorrect one is constructed, or a raw database query // attempts an invalid unlock operation. - UpdaterLock { + &UpdaterLock { updater_id: saga2, locked_gen: lock1.locked_gen, }, @@ -1515,7 +1516,7 @@ mod tests { // unlocking with the correct ID should succeed. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, lock1, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock1, None) .await ) .expect("instance should unlock"); @@ -1531,7 +1532,7 @@ mod tests { // Again, these fields are private specifically to prevent // you from doing this exact thing. But, we should still // test that we handle it gracefully. 
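The invariant these lock tests exercise can be modeled with a small in-memory sketch; the real datastore implements this as a guarded UPDATE on the instance row, but the idea is that an unlock only succeeds when both the saga ID and the generation captured at lock time still match:

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
struct UpdaterLock { updater_id: u64, locked_gen: u64 }

#[derive(Default)]
struct LockState { updater_id: Option<u64>, updater_gen: u64 }

impl LockState {
    // Take the lock only if it is currently free.
    fn try_lock(&mut self, saga_id: u64) -> Option<UpdaterLock> {
        if self.updater_id.is_some() {
            return None;
        }
        self.updater_gen += 1;
        self.updater_id = Some(saga_id);
        Some(UpdaterLock { updater_id: saga_id, locked_gen: self.updater_gen })
    }

    // Release the lock only if the caller still holds it: both the saga ID
    // and the generation recorded at lock time must match.
    fn unlock(&mut self, lock: &UpdaterLock) -> bool {
        if self.updater_id != Some(lock.updater_id)
            || self.updater_gen != lock.locked_gen
        {
            return false;
        }
        self.updater_id = None;
        self.updater_gen += 1;
        true
    }
}
```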
- UpdaterLock { updater_id: saga1, locked_gen: next_gen }, + &UpdaterLock { updater_id: saga1, locked_gen: next_gen }, None, ) .await diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 7493841091f..4b7f92ef01c 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -605,8 +605,10 @@ async fn siu_unbecome_updater( ) -> Result<(), anyhow::Error> { let RealParams { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; - unlock_instance_inner(serialized_authn, authz_instance, &sagactx, None) - .await?; + let lock = sagactx.lookup::(INSTANCE_LOCK)?; + + unwind_instance_lock(lock, serialized_authn, authz_instance, &sagactx) + .await; Ok(()) } @@ -760,17 +762,43 @@ async fn siu_unassign_oximeter_producer( async fn siu_commit_instance_updates( sagactx: NexusActionContext, ) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); let RealParams { serialized_authn, authz_instance, ref update, .. } = sagactx.saga_params::()?; - unlock_instance_inner( - &serialized_authn, - &authz_instance, - &sagactx, - Some(&update.new_runtime), - ) - .await?; + let lock = sagactx.lookup::(INSTANCE_LOCK)?; + + let opctx = + crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + let instance_id = authz_instance.id(); + slog::debug!( + osagactx.log(), + "instance update: committing new runtime state and unlocking..."; + "instance_id" => %instance_id, + "new_runtime" => ?update.new_runtime, + "lock" => ?lock, + ); + + let did_unlock = osagactx + .datastore() + .instance_updater_unlock( + &opctx, + &authz_instance, + &lock, + Some(&update.new_runtime), + ) + .await + .map_err(ActionError::action_failed)?; + + slog::info!( + osagactx.log(), + "instance update: committed update new runtime state!"; + "instance_id" => %instance_id, + "new_runtime" => ?update.new_runtime, + "did_unlock" => ?did_unlock, + ); + // Check if the VMM or migration state has changed while the update saga was // running and whether an additional update saga is now required. If one is // required, try to start it. @@ -858,37 +886,135 @@ async fn chain_update_saga( Ok(()) } -async fn unlock_instance_inner( +/// Unlock the instance record while unwinding. +/// +/// This is factored out of the actual reverse action, because the `Params` type +/// differs between the start saga and the actual instance update sagas, both of +/// which must unlock the instance in their reverse actions. +async fn unwind_instance_lock( + lock: instance::UpdaterLock, serialized_authn: &authn::saga::Serialized, authz_instance: &authz::Instance, sagactx: &NexusActionContext, - new_runtime: Option<&InstanceRuntimeState>, -) -> Result<(), ActionError> { - let lock = sagactx.lookup::(INSTANCE_LOCK)?; - let opctx = - crate::context::op_context_for_saga_action(&sagactx, serialized_authn); +) { + // /!\ EXTREMELY IMPORTANT WARNING /!\ + // + // This comment is a message, and part of a system of messages. Pay + // attention to it! The message is a warning about danger. + // + // The danger is still present in your time, as it was in ours. The danger + // is to the instance record, and it can deadlock. + // + // When unwinding, unlocking an instance MUST succeed at all costs. This is + // of the upmost importance. It's fine for unlocking an instance in a + // forward action to fail, since the reverse action will still unlock the + // instance when the saga is unwound. 
However, when unwinding, we must + // ensure the instance is unlocked, no matter what. This is because a + // failure to unlock the instance will leave the instance record in a + // PERMANENTLY LOCKED state, since no other update saga will ever be + // able to lock it again. If we can't unlock the instance here, our death + // will ruin the instance record forever and it will only be able to be + // removed by manual operator intervention. That would be...not great. + // + // Therefore, this action will retry the attempt to unlock the instance + // until it either: + // + // - succeeds, and we know the instance is now unlocked. + // - fails *because the instance doesn't exist*, in which case we can die + // happily because it doesn't matter if the instance is actually unlocked. + use dropshot::HttpError; + use futures::{future, TryFutureExt}; + use omicron_common::backoff; + let osagactx = sagactx.user_data(); - slog::info!( - osagactx.log(), - "instance update: unlocking instance"; - "instance_id" => %authz_instance.id(), + let log = osagactx.log(); + let instance_id = authz_instance.id(); + let opctx = + crate::context::op_context_for_saga_action(sagactx, &serialized_authn); + + debug!( + log, + "instance update: unlocking instance on unwind"; + "instance_id" => %instance_id, "lock" => ?lock, ); - let did_unlock = osagactx - .datastore() - .instance_updater_unlock(&opctx, authz_instance, lock, new_runtime) - .await - .map_err(ActionError::action_failed)?; + const WARN_DURATION: std::time::Duration = + std::time::Duration::from_secs(20); - slog::info!( - osagactx.log(), - "instance update: unlocked instance"; - "instance_id" => %authz_instance.id(), - "did_unlock" => ?did_unlock, - ); + let did_unlock = backoff::retry_notify_ext( + // This is an internal service query to CockroachDB. + backoff::retry_policy_internal_service(), + || { + osagactx + .datastore() + .instance_updater_unlock(&opctx, authz_instance, &lock, None) + .or_else(|err| future::ready(match err { + // The instance record was not found. It's probably been + // deleted. That's fine, we can now die happily, since we won't + // be leaving the instance permanently locked. + Error::ObjectNotFound { .. } => { + info!( + log, + "instance update: giving up on unlocking instance, \ + as it no longer exists"; + "instance_id" => %instance_id, + "lock" => ?lock, + ); - Ok(()) + Ok(false) + }, + // All other errors should be retried. 
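The shape of that retry policy can be sketched under simplified assumptions, with a synchronous loop and a hypothetical error type standing in for `omicron_common::backoff` and the datastore error:

```rust
use std::time::Duration;

enum UnlockError { InstanceNotFound, Other }

// Keep retrying the unlock until it succeeds; give up happily only if the
// instance record no longer exists, since then nothing can stay locked.
fn unlock_or_die_trying(
    mut try_unlock: impl FnMut() -> Result<bool, UnlockError>,
) -> bool {
    let mut delay = Duration::from_millis(250);
    loop {
        match try_unlock() {
            Ok(did_unlock) => return did_unlock,
            Err(UnlockError::InstanceNotFound) => return false,
            Err(UnlockError::Other) => {
                // Back off and try again, forever if necessary.
                std::thread::sleep(delay);
                delay = (delay * 2).min(Duration::from_secs(30));
            }
        }
    }
}
```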
+ _ => Err(backoff::BackoffError::transient(err)), + })) + }, + |error, call_count, total_duration| { + let http_error = HttpError::from(error.clone()); + if http_error.status_code.is_client_error() { + error!( + &log, + "instance update: client error while unlocking instance \ + (likely requires operator intervention), retrying anyway"; + "instance_id" => %instance_id, + "lock" => ?lock, + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else if total_duration > WARN_DURATION { + warn!( + &log, + "instance update: server error while unlocking instance, + retrying"; + "instance_id" => %instance_id, + "lock" => ?lock, + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } else { + info!( + &log, + "server error while recording saga event, retrying"; + "instance_id" => %instance_id, + "lock" => ?lock, + "error" => &error, + "call_count" => call_count, + "total_duration" => ?total_duration, + ); + } + }, + ) + .await + .expect("errors should be retried indefinitely"); + + info!( + log, + "instance update: unlocked instance while unwinding"; + "instance_id" => %instance_id, + "lock" => ?lock, + "did_unlock" => did_unlock, + ); } #[cfg(test)] diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 27620d01aaf..a16d67e06fd 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -119,13 +119,23 @@ async fn siu_lock_instance_undo( ) -> Result<(), anyhow::Error> { let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; - super::unlock_instance_inner( - serialized_authn, - authz_instance, - &sagactx, - None, - ) - .await?; + + // If the instance lock node in the saga context was `None`, that means + // we didn't acquire the lock, and we can die happily without having to + // worry about unlocking the instance. It would be pretty surprising if this + // saga unwound without having acquired the lock, but...whatever. + if let Some(lock) = + sagactx.lookup::>(INSTANCE_LOCK)? 
+ { + super::unwind_instance_lock( + lock, + serialized_authn, + authz_instance, + &sagactx, + ) + .await; + } + Ok(()) } @@ -152,9 +162,9 @@ async fn siu_fetch_state_and_start_real_saga( let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + let datastore = osagactx.datastore(); - let state = osagactx - .datastore() + let state = datastore .instance_fetch_all(&opctx, &authz_instance) .await .map_err(ActionError::action_failed)?; @@ -201,13 +211,11 @@ async fn siu_fetch_state_and_start_real_saga( "current.active_vmm" => ?state.active_vmm, "current.target_vmm" => ?state.target_vmm, ); - super::unlock_instance_inner( - &serialized_authn, - &authz_instance, - &sagactx, - None, - ) - .await?; + osagactx + .datastore() + .instance_updater_unlock(&opctx, &authz_instance, &orig_lock, None) + .await + .map_err(ActionError::action_failed)?; } Ok(()) From 80bb38ddcacca6a9cf49824805e23a5eab067122 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 12:12:54 -0700 Subject: [PATCH 169/234] assert instance state is consistent when unwinding as suggested by @gjcolombo in https://github.com/oxidecomputer/omicron/pull/5749#discussion_r1681841277 --- nexus/src/app/sagas/instance_update/mod.rs | 56 +++++++++++++++++++--- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 4b7f92ef01c..6ce665bf01b 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1112,6 +1112,44 @@ mod test { ) } + // Asserts that an instance record is in a consistent state (e.g., that all + // state changes performed by the update saga are either applied atomically, + // or have not been applied). This is particularly important to check when a + // saga unwinds. + #[track_caller] + fn assert_instance_record_is_consistent(instance: &Instance) { + let run_state = instance.runtime(); + match run_state.nexus_state { + InstanceState::Vmm => assert!( + run_state.propolis_id.is_some(), + "if the instance record is in the `Vmm` state, it must have \ + an active VMM\ninstance: {instance:#?}", + ), + state => assert_eq!( + run_state.propolis_id, None, + "if the instance record is in the `{state:?}` state, it must \ + not have an active VMM\ninstance: {instance:#?}", + ), + } + + if run_state.dst_propolis_id.is_some() { + assert!( + run_state.migration_id.is_some(), + "if the instance record has a target VMM ID, then it must \ + also have a migration\ninstance: {instance:#?}", + ); + } + + if run_state.migration_id.is_some() { + assert_eq!( + run_state.nexus_state, + InstanceState::Vmm, + "if an instance is migrating, it must be in the VMM state\n\ + instance: {instance:#?}", + ); + } + } + async fn after_unwinding(cptestctx: &ControlPlaneTestContext) { let state = test_helpers::instance_fetch_by_name( cptestctx, @@ -1121,13 +1159,19 @@ mod test { .await; let instance = state.instance(); - // Unlike most other sagas, we actually don't unwind the - // work performed by an update saga, as we would prefer - // that at least some of it succeeds. The only thing - // that *needs* to be rolled back when an - // instance-update saga fails is that the updater lock - // *MUST* be released so that a subsequent saga can run. + // Unlike most other sagas, we actually don't unwind the work performed + // by an update saga, as we would prefer that at least some of it + // succeeds. 
The only thing that *needs* to be rolled back when an + // instance-update saga fails is that the updater lock *MUST* be + // released so that a subsequent saga can run. + // + // Additionally, we assert that the instance record is in a + // consistent state, ensuring that all changes to the instance record + // are atomic. This is important *because* we won't roll back changes + // to the instance: if we're going to leave them in place, they can't + // be partially applied, even if we unwound partway through the saga. assert_instance_unlocked(instance); + assert_instance_record_is_consistent(instance); // Throw away the instance so that subsequent unwinding // tests also operate on an instance in the correct From f7afb8554c90740a790e8720654390185e95722a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 14:39:36 -0700 Subject: [PATCH 170/234] start on migration failure tests --- nexus/src/app/sagas/instance_update/mod.rs | 72 +++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 6ce665bf01b..58892b87742 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1556,6 +1556,68 @@ mod test { .await; } + // === migration failed, target not destroyed === + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Failed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Failed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + let outcome = MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Failed) + .source(MigrationState::Failed, VmmState::Running); + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + #[derive(Clone, Copy, Default)] struct MigrationOutcome { source: Option<(MigrationState, VmmState)>, @@ -1864,9 +1926,17 @@ mod test { let active_vmm_id = instance_runtime.propolis_id; assert_instance_unlocked(instance); + assert_instance_record_is_consistent(instance); if self.outcome.failed { - todo!("eliza: verify migration-failed postconditions"); + assert_eq!( + 
instance_runtime.migration_id, None, + "migration ID must be unset when a migration has failed" + ); + assert_eq!( + instance_runtime.dst_propolis_id, None, + "target VMM ID must be unset when a migration has failed" + ); } else { assert_eq!( active_vmm_id, From 9cca199e9007623336619770e1344968b582e765 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 15:03:49 -0700 Subject: [PATCH 171/234] test more destroyed outcomes --- nexus/src/app/sagas/instance_update/mod.rs | 210 ++++++++++++++++++++- 1 file changed, 202 insertions(+), 8 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 58892b87742..4ed91d268e6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1618,6 +1618,192 @@ mod test { .await; } + // === migration failed, migration target destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Running) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_target_failed_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + let outcome = MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Running); + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + + // === migration failed, migration source destroyed tests === + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_failed_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::InProgress, VmmState::Running) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_failed_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = 
setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::InProgress, VmmState::Running) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_source_failed_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + let outcome = MigrationOutcome::default() + .target(MigrationState::InProgress, VmmState::Running) + .source(MigrationState::Failed, VmmState::Destroyed); + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + + // === migration failed, source and target both destroyed === + + #[nexus_test(server = crate::Server)] + async fn test_migration_failed_everyone_died_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_failed_everyone_died_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Destroyed) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_failed_everyone_died_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + let outcome = MigrationOutcome::default() + .target(MigrationState::Failed, VmmState::Destroyed) + .source(MigrationState::Failed, VmmState::Destroyed); + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + #[derive(Clone, Copy, Default)] struct MigrationOutcome { source: Option<(MigrationState, VmmState)>, @@ -1998,11 +2184,22 @@ mod test { "target VMM should exist if and only if the target hasn't been destroyed", ); - let all_vmms_destroyed = src_destroyed && target_destroyed; + // VThe instance has a VMM if (and only if): + let has_vmm = if self.outcome.failed { + // If the migration failed, the instance should have a VMM if + // and only if the source VMM is still okay. 
It doesn't matter + // whether the target is still there or not, because we didn't + // migrate to it successfully. + !src_destroyed + } else { + // Otherwise, if the migration succeeded, the instance should be + // on the target VMM. + true + }; assert_eq!( no_virtual_provisioning_resource_records_exist(cptestctx).await, - all_vmms_destroyed, + !has_vmm, "virtual provisioning resource records must exist as long as \ the instance has a VMM", ); @@ -2011,16 +2208,13 @@ mod test { cptestctx ) .await, - all_vmms_destroyed, + !has_vmm, "virtual provisioning collection records must exist as long \ as the instance has a VMM", ); - let instance_state = if all_vmms_destroyed { - InstanceState::NoVmm - } else { - InstanceState::Vmm - }; + let instance_state = + if has_vmm { InstanceState::Vmm } else { InstanceState::NoVmm }; assert_eq!(instance_runtime.nexus_state, instance_state); } From af207ca46edd7733b1e0fec07b3840fb2add005a Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 15:56:50 -0700 Subject: [PATCH 172/234] more commentary --- nexus/src/app/sagas/instance_update/mod.rs | 103 ++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 4ed91d268e6..58ffa50c4d7 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -85,7 +85,108 @@ //! //! ## Theory of Operation //! -//! Some ground rules: +//! In order to ensure that changes to the state of an instance are handled +//! reliably, we require that all mutations of an instance` record are performed +//! by a saga. The following sagas currently touch the `instance` record: +//! +//! - [`instance_start`](super::instance_start) +//! - [`instance_migrate`](super::instance_migrate) +//! - [`instance_delete`](super::instance_delete) +//! - `instance_update` (this saga) +//! +//! For most of these sagas, the instance state machine itself guards against +//! potential race conditions. By considering the valid and invalid flows +//! through an instance's state machine, we arrive at some ground rules: +//! +//! - The `instance_migrate` and `instance_delete` sagas will +//! only modify the instance record if the instance *has* an active Propolis +//! ID. +//! - The `instance_start` and instance_delete` sagas will only modify the +//! instance record if the instance does *not* have an active VMM. +//! - The presence of a migration ID prevents an `instance_migrate` saga from +//! succeeding until the current migration is resolved (either completes or +//! fails). +//! - Only the `instance_start` saga can set the instance's *active* Propolis ID, +//! and it can only do this if there is currently no active Propolis. +//! - Only the `instance_migrate` saga can set the instance's *target* Propolis +//! ID and migration ID, and it can only do that if these fields are unset. +//! - Only the `instance_update` saga can unset a migration ID and target +//! Propolis ID, which it will do when handling an update from sled-agent that +//! indicates that a migration has succeeded or failed. +//! - Only the `instance_update` saga can unset an instance's active Propolis +//! ID, which it will do when handling an update from sled-agent that +//! indicates that the VMM has been destroyed (peacefully or violently). +//! +//! For the most part, this state machine prevents race conditions where +//! multiple sagas mutate the same fields in the instance record, because the +//! 
states from which a particular transition may start limited. However, this +//! is not the case for the `instance-update` saga, which may need to run any +//! time a sled-agent publishes a new instance state. Therefore, this saga has +//! the dubious honor of using the only distributed lock in Nexus (at the time +//! of writing), the "instance updater lock". +//! +//! ### The Instance-Updater Lock, or, "Distributed RAII" +//! +//! Distributed locks [are scary][dist-locking]. One of the *scariest* things +//! about distributed locks is that a process can die[^1] while holding a lock, +//! which results in the protected resource (in this case, the `instance` +//! record) being locked forever.[^2] It would be good for that to not happen. +//! Fortunately, *if* (and only if) we promise to *only* ever acquire the the +//! instance-updater lock inside of a saga, we can guarantee forward progress: +//! should a saga fail while holding the lock, it will unwind into a reverse +//! action that releases the lock. This is essentially the distributed +//! equivalent to holding a RAII guard in a Rust program: if the thread holding +//! the lock panics, it unwinds its stack, drops the [`std::sync::MutexGuard`], +//! and the rest of the system is not left in a deadlocked state. As long as we +//! ensure that the instance-updater lock is only ever acquired by sagas, and +//! that any saga holding a lock will reliably release it when it unwinds, we're +//! ... *probably* ... okay. +//! +//! When an `instance-update` saga is started, it attempts to [acquire the +//! updater lock][instance_updater_lock]. If the lock is already held by another +//! update saga, then the update saga completes immediately. Otherwise, the saga +//! then queries CRDB for a snapshot of the current state of the `instance`` +//! record, the active and migration-target `vmm` records (if any exist), and +//! the current `migration` record (if one exists). This snapshot represents the +//! state from which the update will be applied, and must be read only after +//! locking the instance to ensure that it cannot race with another saga. +//! +//! This is where another of this saga's weird quirks shows up: the shape of the +//! saga DAG we wish to execute depends on this instance, active VMM, target +//! VMM, and migration snapshot. But, because this snapshot may only be taken +//! once the lock is acquired, and --- as we discussed above --- the +//! instance-updater lock may only ever be acquired within a saga, we arrive at +//! a bit of a weird impasse: we can't determine what saga DAG to build without +//! looking at the snapshot, but we can't take the snapshot until we've already +//! started a saga. To solve this, we've split this saga into two pieces: the +//! first, `start-instance-update`, is a very small saga that just tries to lock +//! the instance, and upon doing so, loads the instance snapshot from the +//! database and prepares and executes the "real" instance update saga. Once the +//! "real" saga starts, it "inherits" the lock from the start saga by performing +//! [the SQL equivalent equivalent of a compare-and-swap +//! operation][instance_updater_inherit_lock] with its own UUID. +//! +//! The DAG for the "real" update saga depends on the snapshot read within the +//! lock, and since the lock was never released, that snapshot remains valid for +//! its execution. As the final action of the update saga, the instance record's +//! new runtime state is written back to the database and the lock is released, +//! 
in a [single atomic operation][instance_updater_unlock]. Should the update +//! saga fail, it will release the inherited lock. And, if the unwinding update +//! saga unwinds into the start saga, that's fine, because a double-unlock is +//! prevented by the saga ID having changed in the "inherit lock" operation. +//! +//! [instance_updater_lock]: +//! crate::app::db::datastore::DataStore::instance_updater_lock +//! [instance_updater_inherit_lock]: +//! crate::app::db::datastore::DataStore::instance_updater_inherit_lock +//! [instance_updater_unlock]: +//! crate::app::db::datastore::DataStore::instance_updater_unlock +//! [dist-locking]: +//! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html +//! +//! [^1]: And, if a process *can* die, well...we can assume it *will*. +//! [^2]: Barring human intervention. + use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID, From 7998371d17f72e4ee8efe475823c56633ab6f9ce Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 22 Jul 2024 16:11:20 -0700 Subject: [PATCH 173/234] more of my nonsense --- nexus/src/app/sagas/instance_update/mod.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 58ffa50c4d7..ea6bdebcda4 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -175,6 +175,19 @@ //! saga unwinds into the start saga, that's fine, because a double-unlock is //! prevented by the saga ID having changed in the "inherit lock" operation. //! +//! ### Avoiding Missed Updates, or, "The `InstanceRuntimeState` Will Always Get Through" +//! +//! The lock operation we've described above is really more of a "try-lock" +//! operation: if the lock is already held, the saga trying to acquire it just +//! ends immediately, rather than waiting for the lock to be released. This begs +//! the question, "what happens if an instance update comes in while the lock is +//! held?" Do we just...leave it on the floor? Wasn't the whole point of this +//! Rube Goldberg mechanism of sagas to *prevent* instance state changes from +//! being missed? +//! +//! We solve this using an ~~even more layers of complexity~~defense-in-depth +//! approach. +//! //! [instance_updater_lock]: //! crate::app::db::datastore::DataStore::instance_updater_lock //! 
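The lock protocol described above — try-lock with a saga ID, "inherit" the lock by compare-and-swapping the holder's ID, and unlock only if the caller is still the holder — can be modeled in memory to make the double-unlock argument concrete. This is a toy sketch, not the actual CRDB schema or datastore queries; the field names and generation handling are illustrative assumptions, and it assumes the `uuid` crate is available:

use uuid::Uuid;

/// Toy model of the updater-lock columns on an instance record.
#[derive(Debug, Default)]
struct UpdaterLockState {
    updater_id: Option<Uuid>,
    updater_gen: u64,
}

impl UpdaterLockState {
    /// Try-lock: succeeds only if no saga currently holds the lock.
    fn try_lock(&mut self, saga_id: Uuid) -> bool {
        if self.updater_id.is_none() {
            self.updater_id = Some(saga_id);
            self.updater_gen += 1;
            true
        } else {
            false
        }
    }

    /// Inherit: compare-and-swap the holder from the start saga's ID to the
    /// child saga's ID, so only the child can release the lock afterwards.
    fn inherit(&mut self, parent: Uuid, child: Uuid) -> bool {
        if self.updater_id == Some(parent) {
            self.updater_id = Some(child);
            self.updater_gen += 1;
            true
        } else {
            false
        }
    }

    /// Unlock only if the caller is still the holder. A stale holder (for
    /// example, the start saga unwinding after the child saga has already
    /// inherited and released the lock) is a no-op, which is why a
    /// double-unlock during unwinding is harmless.
    fn unlock(&mut self, saga_id: Uuid) -> bool {
        if self.updater_id == Some(saga_id) {
            self.updater_id = None;
            self.updater_gen += 1;
            true
        } else {
            false
        }
    }
}

In the real datastore these transitions are guarded database updates against the instance record rather than in-memory mutations, so the same "only the current holder can release" property holds across processes.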
[instance_updater_inherit_lock]: From de9391bef201107140f27c1ebb0314a02874f1d7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 09:03:12 -0700 Subject: [PATCH 174/234] misc style consistency/cleanup --- .../app/sagas/instance_update/destroyed.rs | 4 +- nexus/src/app/sagas/instance_update/mod.rs | 108 +++++++++++------- nexus/src/app/sagas/instance_update/start.rs | 21 ++-- 3 files changed, 80 insertions(+), 53 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/destroyed.rs b/nexus/src/app/sagas/instance_update/destroyed.rs index e5bdab49263..243f952c8bf 100644 --- a/nexus/src/app/sagas/instance_update/destroyed.rs +++ b/nexus/src/app/sagas/instance_update/destroyed.rs @@ -82,10 +82,9 @@ async fn siu_destroyed_release_sled_resources( info!( osagactx.log(), - "instance update (active VMM destroyed): deallocating sled resource reservation"; + "instance update (VMM destroyed): deallocating sled resource reservation"; "instance_id" => %instance_id, "propolis_id" => %vmm_id, - "instance_update" => %"VMM destroyed", ); osagactx @@ -117,7 +116,6 @@ async fn siu_destroyed_mark_vmm_deleted( "instance update (VMM destroyed): marking VMM record deleted"; "instance_id" => %instance_id, "propolis_id" => %vmm_id, - "instance_update" => %"VMM destroyed", ); osagactx diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index ea6bdebcda4..00a9b5c80a9 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -312,14 +312,36 @@ pub fn update_saga_needed( needed } +/// The set of updates to the instance and its owned resources to perform in +/// response to a VMM/migration state update. +/// +/// Depending on the current state of the instance and its VMM(s) and migration, +/// an update saga may perform a variety of operations. Which operations need to +/// be performed for the current state snapshot of the instance, VMM, and +/// migration records is determined by the [`UpdatesRequired::for_snapshot`] +/// function. #[derive(Debug, Deserialize, Serialize)] struct UpdatesRequired { - /// The new runtime state that must be written back to the database. + /// The new runtime state that must be written back to the database when the + /// saga completes. new_runtime: InstanceRuntimeState, + /// If this is [`Some`], the instance's active VMM with this UUID has + /// transitioned to [`VmmState::Destroyed`], and its resources must be + /// cleaned up by a [`destroyed`] subsaga. destroy_active_vmm: Option, + + /// If this is [`Some`], the instance's migration target VMM with this UUID + /// has transitioned to [`VmmState::Destroyed`], and its resources must be + /// cleaned up by a [`destroyed`] subsaga. destroy_target_vmm: Option, + + /// If `true`, the instance no longer has an active VMM, and its + /// virtual provisioning resource records and Oximeter producer should be + /// deallocated. 
deprovision: bool, + + /// If this is [`Some`], network_config: Option, } @@ -684,9 +706,10 @@ async fn siu_become_updater( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); let osagactx = sagactx.user_data(); + let log = osagactx.log(); - slog::debug!( - osagactx.log(), + debug!( + log, "instance update: trying to become instance updater..."; "instance_id" => %authz_instance.id(), "saga_id" => %saga_id, @@ -704,9 +727,9 @@ async fn siu_become_updater( .await .map_err(ActionError::action_failed)?; - slog::info!( - osagactx.log(), - "Now, I am become Updater, the destroyer of VMMs."; + info!( + log, + "instance_update: Now, I am become Updater, the destroyer of VMMs."; "instance_id" => %authz_instance.id(), "saga_id" => %saga_id, ); @@ -732,19 +755,23 @@ async fn siu_update_network_config( ) -> Result<(), ActionError> { let Params { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params()?; + + let update = + sagactx.lookup::(NETWORK_CONFIG_UPDATE)?; + let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let osagactx = sagactx.user_data(); let nexus = osagactx.nexus(); - let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + let log = osagactx.log(); - let update = - sagactx.lookup::(NETWORK_CONFIG_UPDATE)?; + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); match update { NetworkConfigUpdate::Delete => { info!( - osagactx.log(), + log, "instance update: deleting network config"; "instance_id" => %instance_id, ); @@ -755,7 +782,7 @@ async fn siu_update_network_config( } NetworkConfigUpdate::Update { active_propolis_id, new_sled_id } => { info!( - osagactx.log(), + log, "instance update: ensuring updated instance network config"; "instance_id" => %instance_id, "active_propolis_id" => %active_propolis_id, @@ -796,14 +823,15 @@ async fn siu_release_virtual_provisioning( let instance = state.instance; let vmm_id = { - let id = instance - .runtime() - .propolis_id - .expect("a `release_virtual_provisioning` action should not have been pushed if there is no active VMM ID"); + let id = instance.runtime().propolis_id.expect( + "a `release_virtual_provisioning` action should not have been \ + pushed if there is no active VMM ID", + ); PropolisUuid::from_untyped_uuid(id) }; let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + let log = osagactx.log(); let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); @@ -820,25 +848,23 @@ async fn siu_release_virtual_provisioning( match result { Ok(deleted) => { info!( - osagactx.log(), - "instance update (VMM destroyed): deallocated virtual \ - provisioning resources"; + log, + "instance update (no VMM): deallocated virtual provisioning \ + resources"; "instance_id" => %instance_id, "propolis_id" => %vmm_id, "records_deleted" => ?deleted, - "instance_update" => %"active VMM destroyed", ); } // Necessary for idempotency --- the virtual provisioning resources may // have been deleted already, that's fine. Err(Error::ObjectNotFound { .. 
}) => { info!( - osagactx.log(), - "instance update (VMM destroyed): virtual provisioning \ - record not found; perhaps it has already been deleted?"; + log, + "instance update (no VMM): virtual provisioning record not \ + found; perhaps it has already been deleted?"; "instance_id" => %instance_id, "propolis_id" => %vmm_id, - "instance_update" => %"active VMM destroyed", ); } Err(err) => return Err(ActionError::action_failed(err)), @@ -856,16 +882,16 @@ async fn siu_unassign_oximeter_producer( let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); + let log = osagactx.log(); info!( - osagactx.log(), - "instance update (VMM destroyed): unassigning oximeter producer"; + log, + "instance update (no VMM): unassigning oximeter producer"; "instance_id" => %authz_instance.id(), - "instance_update" => %"active VMM destroyed", ); crate::app::oximeter::unassign_producer( osagactx.datastore(), - osagactx.log(), + log, &opctx, &authz_instance.id(), ) @@ -883,11 +909,12 @@ async fn siu_commit_instance_updates( let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); + let log = osagactx.log(); let instance_id = authz_instance.id(); - slog::debug!( - osagactx.log(), + debug!( + log, "instance update: committing new runtime state and unlocking..."; "instance_id" => %instance_id, "new_runtime" => ?update.new_runtime, @@ -905,8 +932,8 @@ async fn siu_commit_instance_updates( .await .map_err(ActionError::action_failed)?; - slog::info!( - osagactx.log(), + info!( + log, "instance update: committed update new runtime state!"; "instance_id" => %instance_id, "new_runtime" => ?update.new_runtime, @@ -927,13 +954,12 @@ async fn siu_commit_instance_updates( if let Err(error) = chain_update_saga(&sagactx, authz_instance, serialized_authn).await { - let osagactx = sagactx.user_data(); // If starting the new update saga failed, DO NOT unwind this saga and // undo all the work we've done successfully! Instead, just kick the // instance-updater background task to try and start a new saga // eventually, and log a warning. warn!( - osagactx.log(), + log, "instance update: failed to start successor saga!"; "instance_id" => %instance_id, "error" => %error, @@ -952,6 +978,8 @@ async fn chain_update_saga( let opctx = crate::context::op_context_for_saga_action(sagactx, &serialized_authn); let osagactx = sagactx.user_data(); + let log = osagactx.log(); + let instance_id = authz_instance.id(); // Fetch the state from the database again to see if we should immediately @@ -962,11 +990,9 @@ async fn chain_update_saga( .await .context("failed to fetch latest snapshot for instance")?; - if let Some(update) = - UpdatesRequired::for_snapshot(osagactx.log(), &new_state) - { + if let Some(update) = UpdatesRequired::for_snapshot(log, &new_state) { debug!( - osagactx.log(), + log, "instance update: additional updates required, preparing a \ successor update saga..."; "instance_id" => %instance_id, @@ -991,7 +1017,7 @@ async fn chain_update_saga( // N.B. that we don't wait for the successor update saga to *complete* // here. We just want to make sure it starts. 
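The control flow around successor sagas — re-check the instance after committing, try to chain a new update saga if more work has appeared, and fall back to activating the background task if that fails — can be summarized as a small decision function. Everything below is a stand-in sketch (the `needs_update`, `start_update_saga`, and `activate_background_task` names are hypothetical), not the actual Nexus interfaces:

/// Hypothetical outcome of the "what do we do next?" decision made after an
/// update saga has committed its changes and released the lock.
#[derive(Debug, PartialEq)]
enum NextStep {
    /// Nothing more to do right now.
    Done,
    /// A successor update saga was started.
    ChainedSuccessor,
    /// Starting the successor failed; the periodic background task was
    /// activated so the update is retried eventually instead of being lost.
    DeferredToBackgroundTask,
}

fn after_commit(
    needs_update: impl Fn() -> bool,
    start_update_saga: impl Fn() -> Result<(), String>,
    activate_background_task: impl Fn(),
) -> NextStep {
    // Re-read the instance state *after* releasing the lock; more updates
    // may have accumulated while this saga was running.
    if !needs_update() {
        return NextStep::Done;
    }
    match start_update_saga() {
        Ok(()) => NextStep::ChainedSuccessor,
        Err(_) => {
            // Do not unwind the work already committed; just make sure some
            // later actor picks the instance up again.
            activate_background_task();
            NextStep::DeferredToBackgroundTask
        }
    }
}

This mirrors the behavior in the code above: a failure to chain is logged and deferred rather than unwinding work that has already been committed.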
info!( - osagactx.log(), + log, "instance update: successor update saga started!"; "instance_id" => %instance_id, ); @@ -1086,7 +1112,7 @@ async fn unwind_instance_lock( let http_error = HttpError::from(error.clone()); if http_error.status_code.is_client_error() { error!( - &log, + log, "instance update: client error while unlocking instance \ (likely requires operator intervention), retrying anyway"; "instance_id" => %instance_id, @@ -1097,7 +1123,7 @@ async fn unwind_instance_lock( ); } else if total_duration > WARN_DURATION { warn!( - &log, + log, "instance update: server error while unlocking instance, retrying"; "instance_id" => %instance_id, @@ -1108,7 +1134,7 @@ async fn unwind_instance_lock( ); } else { info!( - &log, + log, "server error while recording saga event, retrying"; "instance_id" => %instance_id, "lock" => ?lock, diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index a16d67e06fd..385c6b722ae 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -92,12 +92,14 @@ async fn siu_lock_instance( let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; let opctx = crate::context::op_context_for_saga_action(&sagactx, serialized_authn); - slog::info!( + + info!( osagactx.log(), "instance update: attempting to lock instance"; "instance_id" => %authz_instance.id(), "saga_id" => %lock_id, ); + let locked = osagactx .datastore() .instance_updater_lock(&opctx, authz_instance, lock_id) @@ -146,13 +148,16 @@ async fn siu_fetch_state_and_start_real_saga( sagactx.saga_params::()?; let osagactx = sagactx.user_data(); let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; + + let log = osagactx.log(); + // Did we get the lock? If so, we can start the next saga, otherwise, just // exit gracefully. let Some(orig_lock) = sagactx.lookup::>(INSTANCE_LOCK)? else { - slog::info!( - osagactx.log(), + info!( + log, "instance update: instance is already locked! doing nothing..."; "instance_id" => %authz_instance.id(), "saga_id" => %lock_id, @@ -173,10 +178,9 @@ async fn siu_fetch_state_and_start_real_saga( // state snapshot. If there are updates to perform, execute the "real" // update saga. Otherwise, if we don't need to do anything else, simply // release the lock and finish this saga. 
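As the comment above notes, everything the "real" saga will do is decided from the snapshot read while holding the lock. That planning step can be pictured as turning the computed update into a list of steps. The sketch below uses a simplified stand-in for `UpdatesRequired` (field types reduced to flags) and a hypothetical `Step` enum; the exact ordering is illustrative rather than the precise DAG built by the real saga:

/// Simplified stand-in for the real `UpdatesRequired`, keeping only whether
/// each optional piece of work is present.
struct PlannedUpdate {
    network_config: bool,
    deprovision: bool,
    destroy_active_vmm: bool,
    destroy_target_vmm: bool,
}

#[derive(Debug, PartialEq)]
enum Step {
    BecomeUpdater,
    UpdateNetworkConfig,
    ReleaseVirtualProvisioning,
    UnassignOximeterProducer,
    CommitUpdatesAndUnlock,
    DestroyVmm, // one per destroyed VMM
}

fn plan(update: &PlannedUpdate) -> Vec<Step> {
    let mut steps = vec![Step::BecomeUpdater];
    if update.network_config {
        steps.push(Step::UpdateNetworkConfig);
    }
    if update.deprovision {
        steps.push(Step::ReleaseVirtualProvisioning);
        steps.push(Step::UnassignOximeterProducer);
    }
    steps.push(Step::CommitUpdatesAndUnlock);
    if update.destroy_active_vmm {
        steps.push(Step::DestroyVmm);
    }
    if update.destroy_target_vmm {
        steps.push(Step::DestroyVmm);
    }
    steps
}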
- if let Some(update) = UpdatesRequired::for_snapshot(osagactx.log(), &state) - { + if let Some(update) = UpdatesRequired::for_snapshot(log, &state) { info!( - osagactx.log(), + log, "instance update: starting real update saga..."; "instance_id" => %authz_instance.id(), "current.runtime_state" => ?state.instance.runtime(), @@ -203,7 +207,7 @@ async fn siu_fetch_state_and_start_real_saga( .map_err(ActionError::action_failed)?; } else { info!( - osagactx.log(), + log, "instance update: no updates required, releasing lock."; "instance_id" => %authz_instance.id(), "current.runtime_state" => ?state.instance.runtime(), @@ -211,8 +215,7 @@ async fn siu_fetch_state_and_start_real_saga( "current.active_vmm" => ?state.active_vmm, "current.target_vmm" => ?state.target_vmm, ); - osagactx - .datastore() + datastore .instance_updater_unlock(&opctx, &authz_instance, &orig_lock, None) .await .map_err(ActionError::action_failed)?; From c0a4dda6bc1f32269907afb2b0b8edf202d8a101 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 09:19:50 -0700 Subject: [PATCH 175/234] don't need to serialize the entire snapshot --- nexus/src/app/sagas/instance_update/mod.rs | 43 ++++++++++------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 00a9b5c80a9..2cbe1c7d203 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -209,6 +209,7 @@ use crate::app::db::datastore::instance::InstanceUpdateResult; use crate::app::db::datastore::InstanceSnapshot; use crate::app::db::lookup::LookupPath; use crate::app::db::model::Generation; +use crate::app::db::model::Instance; use crate::app::db::model::InstanceRuntimeState; use crate::app::db::model::InstanceState; use crate::app::db::model::MigrationState; @@ -336,10 +337,13 @@ struct UpdatesRequired { /// cleaned up by a [`destroyed`] subsaga. destroy_target_vmm: Option, - /// If `true`, the instance no longer has an active VMM, and its + /// If this is [`Some`], the instance no longer has an active VMM, and its /// virtual provisioning resource records and Oximeter producer should be /// deallocated. - deprovision: bool, + /// + /// The entire instance record is required for this, since we need to know + /// the instance's virtual resource requests in order to deallocate them. + deprovision: Option, /// If this is [`Some`], network_config: Option, @@ -363,12 +367,13 @@ impl UpdatesRequired { let mut update_required = false; let mut network_config = None; - let mut deprovision = false; + let mut deprovision = true; // Has the active VMM been destroyed? let destroy_active_vmm = snapshot.active_vmm.as_ref().and_then(|active_vmm| { if active_vmm.runtime.state == VmmState::Destroyed { + let id = PropolisUuid::from_untyped_uuid(active_vmm.id); // Unlink the active VMM ID. If the active VMM was destroyed // because a migration out completed, the next block, which // handles migration updates, will set this to the new VMM's ID, @@ -393,7 +398,7 @@ impl UpdatesRequired { // will change this to a network config update if the // instance is now living somewhere else. network_config = Some(NetworkConfigUpdate::Delete); - Some(PropolisUuid::from_untyped_uuid(active_vmm.id)) + Some(id) } else { None } @@ -506,10 +511,10 @@ impl UpdatesRequired { // Even if the active VMM was destroyed (and we set the // instance's state to `NoVmm` above), it has successfully - // migrated, so leave it in the VMM state. 
+ // migrated, so leave it in the VMM state and don't deallocate + // virtual provisioning records --- the instance is still + // incarnated. new_runtime.nexus_state = InstanceState::Vmm; - // If the active VMM has also been destroyed, don't delete - // virtual provisioning records while cleaning it up. deprovision = false; } } @@ -522,7 +527,7 @@ impl UpdatesRequired { new_runtime, destroy_active_vmm, destroy_target_vmm, - deprovision, + deprovision: deprovision.then(|| snapshot.instance.clone()), network_config, }) } @@ -544,8 +549,6 @@ struct RealParams { authz_instance: authz::Instance, - state: InstanceSnapshot, - update: UpdatesRequired, orig_lock: instance::UpdaterLock, @@ -554,6 +557,7 @@ struct RealParams { const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const INSTANCE_LOCK: &str = "updater_lock"; const NETWORK_CONFIG_UPDATE: &str = "network_config_update"; +const DEPROVISION_INSTANCE: &str = "deprovision_instance"; // instance update saga: actions @@ -634,7 +638,8 @@ impl NexusSaga for SagaDoActualInstanceUpdate { // If the instance now has no active VMM, release its virtual // provisioning resources and unassign its Oximeter producer. - if params.update.deprovision { + if let Some(ref instance) = params.update.deprovision { + builder.append(const_node(DEPROVISION_INSTANCE, instance)?); builder.append(release_virtual_provisioning_action()); builder.append(unassign_oximeter_producer_action()); } @@ -818,17 +823,9 @@ async fn siu_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, state, .. } = + let RealParams { ref serialized_authn, ref authz_instance, .. } = sagactx.saga_params::()?; - - let instance = state.instance; - let vmm_id = { - let id = instance.runtime().propolis_id.expect( - "a `release_virtual_provisioning` action should not have been \ - pushed if there is no active VMM ID", - ); - PropolisUuid::from_untyped_uuid(id) - }; + let instance = sagactx.lookup::(DEPROVISION_INSTANCE)?; let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); let log = osagactx.log(); @@ -852,7 +849,6 @@ async fn siu_release_virtual_provisioning( "instance update (no VMM): deallocated virtual provisioning \ resources"; "instance_id" => %instance_id, - "propolis_id" => %vmm_id, "records_deleted" => ?deleted, ); } @@ -864,7 +860,6 @@ async fn siu_release_virtual_provisioning( "instance update (no VMM): virtual provisioning record not \ found; perhaps it has already been deleted?"; "instance_id" => %instance_id, - "propolis_id" => %vmm_id, ); } Err(err) => return Err(ActionError::action_failed(err)), @@ -1000,7 +995,7 @@ async fn chain_update_saga( "update.network_config_update" => ?update.network_config, "update.destroy_active_vmm" => ?update.destroy_active_vmm, "update.destroy_target_vmm" => ?update.destroy_target_vmm, - "update.deprovision" => update.deprovision, + "update.deprovision" => ?update.deprovision, ); let saga_dag = SagaInstanceUpdate::prepare(&Params { serialized_authn, From 0090402a7e9473e9216a2c8dd544381745a70b41 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 09:31:34 -0700 Subject: [PATCH 176/234] finish the Next Great American Novel --- nexus/src/app/sagas/instance_update/mod.rs | 55 +++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 2cbe1c7d203..9a699c17b1f 100644 --- 
a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -186,7 +186,57 @@ //! being missed? //! //! We solve this using an ~~even more layers of complexity~~defense-in-depth -//! approach. +//! approach. Together, a number of mechanisms exist to ensure that (a) an +//! instance whose VMM and migration states require an update saga will always +//! have an update saga run eventually, and (b) update sagas are run in as +//! timely a manner as possible. +//! +//! The first of these ~~layers of nonsense~~redundant systems to +//! prevent missed updates is perhaps the simplest one: _avoiding unnecessary +//! update sagas_. The `cpapi_instances_put` API endpoint and instance-watcher +//! background tasks handle changes to VMM and migration states by calling the +//! [`Nexus::notify_instance_updated`] method, which writes the new states to +//! the database and (potentially) starts an update saga. Naively, this method +//! would *always* start an update saga, but remember that --- as we discussed +//! [above](#background) --- many VMM/migration state changes don't actually +//! require modifying the instance record. For example, if an instance's VMM +//! transitions from [`VmmState::Starting`] to [`VmmState::Running`], that +//! changes the instance's externally-visible effective state, but it does *not* +//! require an instance record update. By not starting an update saga unless one +//! is actually required, we reduce updater lock contention, so that the lock is +//! less likely to be held when VMM and migration states that actually *do* +//! require an update saga are published. The [`update_saga_needed`] function in +//! this module contains the logic for determining whether an update saga is +//! required. +//! +//! The second mechanism for ensuring updates are performed in a timely manner +//! is what I'm calling _saga chaining_. When the final action in an +//! instance-update saga writes back the instance record and releases the +//! updater lock, it will then perform a second query to read the instance, VMM, +//! and migration records. If the current state of the instance indicates that +//! another update saga is needed, then the completing saga will execute a new +//! start saga as its final action. +//! +//! The last line of defense is the `instance-updater` background task. This +//! task periodically queries the database to list instances which require +//! update sagas (either their active VMM is `Destroyed` or their active +//! migration has terminated) and are not currently locked by another update +//! saga. A new update saga is started for any such instances found. Because +//! this task runs periodically, it ensures that eventually, an update saga will +//! be started for any instance that requires one.[^3] +//! +//! The background task ensures that sagas are started eventually, but because +//! it only runs occasionally, update sagas started by it may be somewhat +//! delayed. To improve the timeliness of update sagas, we will also explicitly +//! activate the background task at any point where we know that an update saga +//! *should* run but we were not able to run it. If an update saga cannot be +//! started, whether by [`Nexus::notify_instance_updated`], a +//! `start-instance-update` saga attempting to start its real saga, or an +//! `instance-update` saga chaining into a new one as its last action, the +//! `instance-watcher` background task is activated. Similarly, when a +//! 
`start-instance-update` saga fails to acquire the lock and exits, it +//! activates the background task as well. This ensures that we will attempt the +//! update again. //! //! [instance_updater_lock]: //! crate::app::db::datastore::DataStore::instance_updater_lock @@ -199,6 +249,9 @@ //! //! [^1]: And, if a process *can* die, well...we can assume it *will*. //! [^2]: Barring human intervention. +//! [^3]: Even if the Nexus instance that processed the state update died +//! between when it wrote the state to CRDB and when it started the +//! requisite update saga! use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, From 237831ce37e93e1d4441e401c143868f85fa8535 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 09:33:58 -0700 Subject: [PATCH 177/234] whoops i broke it --- nexus/src/app/sagas/instance_update/start.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 385c6b722ae..d3dd289c690 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -191,7 +191,7 @@ async fn siu_fetch_state_and_start_real_saga( "update.network_config_update" => ?update.network_config, "update.destroy_active_vmm" => ?update.destroy_active_vmm, "update.destroy_target_vmm" => ?update.destroy_target_vmm, - "update.deprovision" => update.deprovision, + "update.deprovision" => update.deprovision.is_some(), ); osagactx .nexus() @@ -199,7 +199,6 @@ async fn siu_fetch_state_and_start_real_saga( .saga_execute::(RealParams { serialized_authn, authz_instance, - state, update, orig_lock, }) From e2f451d8033bca020693c226db025d348fa00514 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 09:43:00 -0700 Subject: [PATCH 178/234] serialize less stuff --- nexus/src/app/sagas/instance_update/mod.rs | 48 +++++++++++++++------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 9a699c17b1f..48b55493499 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -261,8 +261,8 @@ use crate::app::db::datastore::instance; use crate::app::db::datastore::instance::InstanceUpdateResult; use crate::app::db::datastore::InstanceSnapshot; use crate::app::db::lookup::LookupPath; +use crate::app::db::model::ByteCount; use crate::app::db::model::Generation; -use crate::app::db::model::Instance; use crate::app::db::model::InstanceRuntimeState; use crate::app::db::model::InstanceState; use crate::app::db::model::MigrationState; @@ -393,10 +393,7 @@ struct UpdatesRequired { /// If this is [`Some`], the instance no longer has an active VMM, and its /// virtual provisioning resource records and Oximeter producer should be /// deallocated. - /// - /// The entire instance record is required for this, since we need to know - /// the instance's virtual resource requests in order to deallocate them. - deprovision: Option, + deprovision: Option, /// If this is [`Some`], network_config: Option, @@ -408,6 +405,15 @@ enum NetworkConfigUpdate { Update { active_propolis_id: PropolisUuid, new_sled_id: Uuid }, } +/// Virtual provisioning counters to release when an instance no longer has a +/// VMM. 
+#[derive(Debug, Deserialize, Serialize)] +struct Deprovision { + project_id: Uuid, + cpus_diff: i64, + ram_diff: ByteCount, +} + impl UpdatesRequired { fn for_snapshot( log: &slog::Logger, @@ -580,7 +586,11 @@ impl UpdatesRequired { new_runtime, destroy_active_vmm, destroy_target_vmm, - deprovision: deprovision.then(|| snapshot.instance.clone()), + deprovision: deprovision.then(|| Deprovision { + project_id: snapshot.instance.project_id, + cpus_diff: i64::from(snapshot.instance.ncpus.0 .0), + ram_diff: snapshot.instance.memory, + }), network_config, }) } @@ -610,7 +620,6 @@ struct RealParams { const INSTANCE_LOCK_ID: &str = "saga_instance_lock_id"; const INSTANCE_LOCK: &str = "updater_lock"; const NETWORK_CONFIG_UPDATE: &str = "network_config_update"; -const DEPROVISION_INSTANCE: &str = "deprovision_instance"; // instance update saga: actions @@ -691,8 +700,7 @@ impl NexusSaga for SagaDoActualInstanceUpdate { // If the instance now has no active VMM, release its virtual // provisioning resources and unassign its Oximeter producer. - if let Some(ref instance) = params.update.deprovision { - builder.append(const_node(DEPROVISION_INSTANCE, instance)?); + if params.update.deprovision.is_some() { builder.append(release_virtual_provisioning_action()); builder.append(unassign_oximeter_producer_action()); } @@ -876,9 +884,19 @@ async fn siu_release_virtual_provisioning( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); - let RealParams { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params::()?; - let instance = sagactx.lookup::(DEPROVISION_INSTANCE)?; + let RealParams { + ref serialized_authn, ref authz_instance, ref update, .. + } = sagactx.saga_params::()?; + let Some(Deprovision { project_id, cpus_diff, ram_diff }) = + update.deprovision + else { + return Err(ActionError::action_failed( + "a `siu_release_virtual_provisioning` action should never have \ + been added to the DAG if the update does not contain virtual \ + resources to deprovision" + .to_string(), + )); + }; let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); let log = osagactx.log(); @@ -890,9 +908,9 @@ async fn siu_release_virtual_provisioning( .virtual_provisioning_collection_delete_instance( &opctx, instance_id, - instance.project_id, - i64::from(instance.ncpus.0 .0), - instance.memory, + project_id, + cpus_diff, + ram_diff, ) .await; match result { From 6902907eae6b1542698c6cd7acd9967880ee5778 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 10:08:53 -0700 Subject: [PATCH 179/234] oops lol --- nexus/src/app/sagas/instance_update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 48b55493499..04385bd8335 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -426,7 +426,7 @@ impl UpdatesRequired { let mut update_required = false; let mut network_config = None; - let mut deprovision = true; + let mut deprovision = false; // Has the active VMM been destroyed? 
let destroy_active_vmm = From 9f3ba514d6f96c8351d5c187fbcb6e9ce1756b80 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 11:16:28 -0700 Subject: [PATCH 180/234] fix docs links (oops) --- nexus/src/app/sagas/instance_update/mod.rs | 42 +++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 04385bd8335..6e664984333 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -17,8 +17,8 @@ //! migration of an instance between two VMMs. //! //! When an instance is incarnated on a sled, the `propolis_id` field in an -//! `instance` record contains a UUID foreign key into the `vmm` table that points -//! to the `vmm` record for the Propolis process on which the instance is +//! `instance` record contains a UUID foreign key into the `vmm` table that +//! points to the `vmm` record for the Propolis process on which the instance is //! currently running. If an instance is undergoing live migration, its record //! additionally contains a `dst_propolis_id` foreign key pointing at the `vmm` //! row representing the *target* Propolis process that it is migrating to, and @@ -98,16 +98,15 @@ //! potential race conditions. By considering the valid and invalid flows //! through an instance's state machine, we arrive at some ground rules: //! -//! - The `instance_migrate` and `instance_delete` sagas will -//! only modify the instance record if the instance *has* an active Propolis -//! ID. +//! - The `instance_migrate` and `instance_delete` sagas will only modify the +//! instance record if the instance *has* an active Propolis ID. //! - The `instance_start` and instance_delete` sagas will only modify the //! instance record if the instance does *not* have an active VMM. //! - The presence of a migration ID prevents an `instance_migrate` saga from //! succeeding until the current migration is resolved (either completes or //! fails). -//! - Only the `instance_start` saga can set the instance's *active* Propolis ID, -//! and it can only do this if there is currently no active Propolis. +//! - Only the `instance_start` saga can set the instance's *active* Propolis +//! ID, and it can only do this if there is currently no active Propolis. //! - Only the `instance_migrate` saga can set the instance's *target* Propolis //! ID and migration ID, and it can only do that if these fields are unset. //! - Only the `instance_update` saga can unset a migration ID and target @@ -191,13 +190,13 @@ //! have an update saga run eventually, and (b) update sagas are run in as //! timely a manner as possible. //! -//! The first of these ~~layers of nonsense~~redundant systems to -//! prevent missed updates is perhaps the simplest one: _avoiding unnecessary -//! update sagas_. The `cpapi_instances_put` API endpoint and instance-watcher -//! background tasks handle changes to VMM and migration states by calling the -//! [`Nexus::notify_instance_updated`] method, which writes the new states to -//! the database and (potentially) starts an update saga. Naively, this method -//! would *always* start an update saga, but remember that --- as we discussed +//! The first of these ~~layers of nonsense~~redundant systems to prevent missed +//! updates is perhaps the simplest one: _avoiding unnecessary update sagas_. +//! The `cpapi_instances_put` API endpoint and instance-watcher background tasks +//! 
handle changes to VMM and migration states by calling the +//! [`notify_instance_updated`] method, which writes the new states to the +//! database and (potentially) starts an update saga. Naively, this method would +//! *always* start an update saga, but remember that --- as we discussed //! [above](#background) --- many VMM/migration state changes don't actually //! require modifying the instance record. For example, if an instance's VMM //! transitions from [`VmmState::Starting`] to [`VmmState::Running`], that @@ -230,13 +229,12 @@ //! delayed. To improve the timeliness of update sagas, we will also explicitly //! activate the background task at any point where we know that an update saga //! *should* run but we were not able to run it. If an update saga cannot be -//! started, whether by [`Nexus::notify_instance_updated`], a -//! `start-instance-update` saga attempting to start its real saga, or an -//! `instance-update` saga chaining into a new one as its last action, the -//! `instance-watcher` background task is activated. Similarly, when a -//! `start-instance-update` saga fails to acquire the lock and exits, it -//! activates the background task as well. This ensures that we will attempt the -//! update again. +//! started, whether by [`notify_instance_updated`], a `start-instance-update` +//! saga attempting to start its real saga, or an `instance-update` saga +//! chaining into a new one as its last action, the `instance-watcher` +//! background task is activated. Similarly, when a `start-instance-update` saga +//! fails to acquire the lock and exits, it activates the background task as +//! well. This ensures that we will attempt the update again. //! //! [instance_updater_lock]: //! crate::app::db::datastore::DataStore::instance_updater_lock @@ -244,6 +242,8 @@ //! crate::app::db::datastore::DataStore::instance_updater_inherit_lock //! [instance_updater_unlock]: //! crate::app::db::datastore::DataStore::instance_updater_unlock +//! [`notify_instance_updated`]: crate::app::Nexus::notify_instance_updated +//! //! [dist-locking]: //! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html //! From eab900c9c3ac5d99e6a4a39a47668ba08c254381 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 12:46:20 -0700 Subject: [PATCH 181/234] remove second bonus license header --- nexus/src/app/sagas/instance_update/start.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index d3dd289c690..db8c88bf153 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -4,10 +4,6 @@ // instance update start saga -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
- use super::{ ActionRegistry, NexusActionContext, NexusSaga, RealParams, SagaDoActualInstanceUpdate, SagaInitError, UpdatesRequired, From 77af396921fb9a88842edca5e68f13dbd6fc66f4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 23 Jul 2024 14:31:28 -0700 Subject: [PATCH 182/234] fix typo (thanks @bcantrill) --- nexus/src/app/sagas/instance_update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 6e664984333..d93e7733048 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -130,7 +130,7 @@ //! about distributed locks is that a process can die[^1] while holding a lock, //! which results in the protected resource (in this case, the `instance` //! record) being locked forever.[^2] It would be good for that to not happen. -//! Fortunately, *if* (and only if) we promise to *only* ever acquire the the +//! Fortunately, *if* (and only if) we promise to *only* ever acquire the //! instance-updater lock inside of a saga, we can guarantee forward progress: //! should a saga fail while holding the lock, it will unwind into a reverse //! action that releases the lock. This is essentially the distributed From 4fd8320878d7fcf9a9038f3003535ced4abf4277 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Jul 2024 10:03:12 -0700 Subject: [PATCH 183/234] instance_and_vmm_update_runtime is dead code now --- nexus/db-queries/src/db/datastore/instance.rs | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index e65a13b22cb..39fec43280a 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -741,68 +741,6 @@ impl DataStore { Ok(updated) } - /// Updates an instance record and a VMM record with a single database - /// command. - /// - /// This is intended to be used to apply updates from sled agent that - /// may change a VMM's runtime state (e.g. moving an instance from Running - /// to Stopped) and its corresponding instance's state (e.g. changing the - /// active Propolis ID to reflect a completed migration) in a single - /// transaction. The caller is responsible for ensuring the instance and - /// VMM states are consistent with each other before calling this routine. - /// - /// # Arguments - /// - /// - instance_id: The ID of the instance to update. - /// - new_instance: The new instance runtime state to try to write. - /// - vmm_id: The ID of the VMM to update. - /// - new_vmm: The new VMM runtime state to try to write. - /// - /// # Return value - /// - /// - `Ok(`[`InstanceUpdateResult`]`)` if the query was issued - /// successfully. The returned [`InstanceUpdateResult`] indicates which - /// database record(s) were updated. Note that an update can fail because - /// it was inapplicable (i.e. the database has state with a newer - /// generation already) or because the relevant record was not found. - /// - `Err` if another error occurred while accessing the database. 
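The "inapplicable because the database already has state with a newer generation" behavior mentioned in the doc comment above is plain optimistic concurrency on a generation column: a write only lands if it carries a strictly newer generation than the row currently holds. A minimal, dependency-free sketch of that idea follows; the types are hypothetical stand-ins, not the real Diesel query or Nexus models.

// Hypothetical in-memory stand-in for the generation-guarded columns.
#[derive(Debug)]
struct VmmRow {
    state: &'static str,
    gen: u64,
}

// Apply `new_state` at `new_gen` only if `new_gen` is strictly newer.
// Returns true if the row was updated, mirroring the `vmm_updated` flag.
fn try_update(row: &mut VmmRow, new_state: &'static str, new_gen: u64) -> bool {
    if new_gen > row.gen {
        row.state = new_state;
        row.gen = new_gen;
        true
    } else {
        // Stale write: the database already has state with a newer (or equal)
        // generation, so the update is inapplicable and is dropped.
        false
    }
}

fn main() {
    let mut row = VmmRow { state: "running", gen: 3 };
    assert!(try_update(&mut row, "stopping", 4));
    // A delayed, older message arriving afterwards is ignored.
    assert!(!try_update(&mut row, "running", 3));
}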
- pub async fn instance_and_vmm_update_runtime( - &self, - instance_id: &InstanceUuid, - new_instance: &InstanceRuntimeState, - vmm_id: &PropolisUuid, - new_vmm: &VmmRuntimeState, - migrations: Migrations<'_>, - ) -> Result { - let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( - *vmm_id, - new_vmm.clone(), - Some((*instance_id, new_instance.clone())), - migrations, - ); - - // The InstanceAndVmmUpdate query handles and indicates failure to find - // either the instance or the VMM, so a query failure here indicates - // some kind of internal error and not a failed lookup. - let result = query - .execute_and_check(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - let instance_updated = result.instance_status.was_updated(); - let vmm_updated = match result.vmm_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }; - Ok(InstanceUpdateResult { - instance_updated, - vmm_updated, - migration_in_updated: result.migration_in_status.was_updated(), - migration_out_updated: result.migration_out_status.was_updated(), - }) - } - /// Lists all instances on in-service sleds with active Propolis VMM /// processes, returning the instance along with the VMM on which it's /// running, the sled on which the VMM is running, and the project that owns From 368dacfd95dc06a276ef8c10d2a9a96b69204034 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 24 Jul 2024 11:02:36 -0700 Subject: [PATCH 184/234] CTE only does VMM and migration updates We no longer use the "InstanceAndVmmUpdate" CTE to update the instance record, just the VMM record and maybe migration record(s). I've removed the instance update from the CTE and updated its naming and docs to reflect this. --- nexus/db-queries/src/db/datastore/instance.rs | 15 +- nexus/db-queries/src/db/datastore/mod.rs | 1 + nexus/db-queries/src/db/datastore/vmm.rs | 52 +++- nexus/db-queries/src/db/queries/mod.rs | 2 +- .../src/db/queries/{instance.rs => vmm.rs} | 250 ++++-------------- ...stance_and_vmm_update_vmm_and_instance.sql | 55 ---- ...pdate_vmm_instance_and_both_migrations.sql | 119 --------- ...m_update_vmm_instance_and_migration_in.sql | 87 ------ ..._update_vmm_instance_and_migration_out.sql | 87 ------ ...ration_update_vmm_and_both_migrations.sql} | 2 - ...migration_update_vmm_and_migration_in.sql} | 2 - ...igration_update_vmm_and_migration_out.sql} | 2 - ... 
=> vmm_and_migration_update_vmm_only.sql} | 2 +- nexus/src/app/sagas/instance_update/mod.rs | 6 +- 14 files changed, 98 insertions(+), 584 deletions(-) rename nexus/db-queries/src/db/queries/{instance.rs => vmm.rs} (68%) delete mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql delete mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql delete mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql delete mode 100644 nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql rename nexus/db-queries/tests/output/{instance_and_vmm_update_vmm_and_both_migrations.sql => vmm_and_migration_update_vmm_and_both_migrations.sql} (99%) rename nexus/db-queries/tests/output/{instance_and_vmm_update_vmm_and_migration_in.sql => vmm_and_migration_update_vmm_and_migration_in.sql} (98%) rename nexus/db-queries/tests/output/{instance_and_vmm_update_vmm_and_migration_out.sql => vmm_and_migration_update_vmm_and_migration_out.sql} (98%) rename nexus/db-queries/tests/output/{instance_and_vmm_update_vmm_only.sql => vmm_and_migration_update_vmm_only.sql} (87%) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 39fec43280a..e61c2fdca88 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -36,7 +36,6 @@ use chrono::Utc; use diesel::prelude::*; use nexus_db_model::ApplySledFilterExt; use nexus_db_model::Disk; -use nexus_db_model::VmmRuntimeState; use nexus_types::deployment::SledFilter; use omicron_common::api; use omicron_common::api::external; @@ -49,7 +48,6 @@ use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; -use omicron_common::api::internal::nexus::Migrations; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; @@ -243,18 +241,6 @@ pub enum UpdaterLockError { Query(#[from] Error), } -/// The result of an [`DataStore::instance_and_vmm_update_runtime`] call, -/// indicating which records were updated. -#[derive(Copy, Clone, Debug)] -pub struct InstanceUpdateResult { - /// `true` if the instance record was updated, `false` otherwise. - pub instance_updated: bool, - /// `true` if the VMM record was updated, `false` otherwise. 
- pub vmm_updated: bool, - pub migration_in_updated: bool, - pub migration_out_updated: bool, -} - impl DataStore { /// Idempotently insert a database record for an Instance /// @@ -1201,6 +1187,7 @@ mod tests { use crate::db::lookup::LookupPath; use nexus_db_model::InstanceState; use nexus_db_model::Project; + use nexus_db_model::VmmRuntimeState; use nexus_db_model::VmmState; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::params; diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 209ee94e121..d9ea3ad31ba 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -123,6 +123,7 @@ pub use sled::SledTransition; pub use sled::TransitionError; pub use switch_port::SwitchPortSettingsCombinedResult; pub use virtual_provisioning_collection::StorageType; +pub use vmm::VmmStateUpdateResult; pub use volume::read_only_resources_associated_with_volume; pub use volume::CrucibleResources; pub use volume::CrucibleTargets; diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 308cfc67db2..eb788fdc898 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -7,7 +7,6 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; -use crate::db::datastore::instance::InstanceUpdateResult; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::Vmm; @@ -34,6 +33,22 @@ use omicron_uuid_kinds::PropolisUuid; use std::net::SocketAddr; use uuid::Uuid; +/// The result of an [`DataStore::vmm_and_migration_update_runtime`] call, +/// indicating which records were updated. +#[derive(Copy, Clone, Debug)] +pub struct VmmStateUpdateResult { + /// `true` if the VMM record was updated, `false` otherwise. + pub vmm_updated: bool, + + /// `true` if a migration record was updated for the migration in, false if + /// no update was performed or no migration in was provided. + pub migration_in_updated: bool, + + /// `true` if a migration record was updated for the migration out, false if + /// no update was performed or no migration out was provided. + pub migration_out_updated: bool, +} + impl DataStore { pub async fn vmm_insert( &self, @@ -143,20 +158,44 @@ impl DataStore { Ok(updated) } + /// Updates a VMM record and associated migration record(s) with a single + /// database command. + /// + /// This is intended to be used to apply updates from sled agent that + /// may change a VMM's runtime state (e.g. moving an instance from Running + /// to Stopped) and the state of its current active mgiration in a single + /// transaction. The caller is responsible for ensuring the VMM and + /// migration states are consistent with each other before calling this + /// routine. + /// + /// # Arguments + /// + /// - `vmm_id`: The ID of the VMM to update. + /// - `new_runtime`: The new VMM runtime state to try to write. + /// - `migrations`: The (optional) migration-in and migration-out states to + /// try to write. + /// + /// # Return value + /// + /// - `Ok(`[`VmmStateUpdateResult`]`)` if the query was issued + /// successfully. The returned [`VmmStateUpdateResult`] indicates which + /// database record(s) were updated. Note that an update can fail because + /// it was inapplicable (i.e. the database has state with a newer + /// generation already) or because the relevant record was not found. 
+ /// - `Err` if another error occurred while accessing the database. pub async fn vmm_and_migration_update_runtime( &self, vmm_id: PropolisUuid, new_runtime: &VmmRuntimeState, migrations: Migrations<'_>, - ) -> Result { - let query = crate::db::queries::instance::InstanceAndVmmUpdate::new( + ) -> Result { + let query = crate::db::queries::vmm::VmmAndMigrationUpdate::new( vmm_id, new_runtime.clone(), - None, migrations, ); - // The InstanceAndVmmUpdate query handles and indicates failure to find + // The VmmAndMigrationUpdate query handles and indicates failure to find // either the VMM or the migration, so a query failure here indicates // some kind of internal error and not a failed lookup. let result = query @@ -164,8 +203,7 @@ impl DataStore { .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - Ok(InstanceUpdateResult { - instance_updated: false, + Ok(VmmStateUpdateResult { vmm_updated: match result.vmm_status { Some(UpdateStatus::Updated) => true, Some(UpdateStatus::NotUpdatedButExists) => false, diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index a1022f91876..46e8a7bc163 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -7,8 +7,8 @@ pub mod disk; pub mod external_ip; -pub mod instance; pub mod ip_pool; +pub mod vmm; #[macro_use] mod next_item; pub mod network_interface; diff --git a/nexus/db-queries/src/db/queries/instance.rs b/nexus/db-queries/src/db/queries/vmm.rs similarity index 68% rename from nexus/db-queries/src/db/queries/instance.rs rename to nexus/db-queries/src/db/queries/vmm.rs index c73f7ac6806..e8eec47141d 100644 --- a/nexus/db-queries/src/db/queries/instance.rs +++ b/nexus/db-queries/src/db/queries/vmm.rs @@ -12,25 +12,22 @@ use diesel::sql_types::{Nullable, Uuid as SqlUuid}; use diesel::{pg::Pg, query_builder::AstPass}; use diesel::{Column, ExpressionMethods, QueryDsl, RunQueryDsl}; use nexus_db_model::{ - schema::{ - instance::dsl as instance_dsl, migration::dsl as migration_dsl, - vmm::dsl as vmm_dsl, - }, - Generation, InstanceRuntimeState, MigrationState, VmmRuntimeState, + schema::{migration::dsl as migration_dsl, vmm::dsl as vmm_dsl}, + Generation, MigrationState, VmmRuntimeState, }; use omicron_common::api::internal::nexus::{MigrationRuntimeState, Migrations}; -use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; +use omicron_uuid_kinds::{GenericUuid, PropolisUuid}; use uuid::Uuid; use crate::db::pool::DbConnection; use crate::db::update_and_check::UpdateStatus; -/// A CTE that checks and updates the instance and VMM tables in a single +/// A CTE that checks and updates the VMM and migration tables in a single /// atomic operation. // // The single-table update-and-check CTE has the following form: // -// WITH found AS (SELECT FROM T WHERE ) +// WITH found AS (SELECT FROM T WHERE ) // updated AS (UPDATE T SET RETURNING *) // SELECT // found. @@ -44,48 +41,50 @@ use crate::db::update_and_check::UpdateStatus; // found. = updated.; // // The idea behind this query is to have separate "found" and "updated" -// subqueries for the instance and VMM tables, then use those to create two more +// subqueries for the VMM and migration tables, then use those to create two more // subqueries that perform the joins and yield the results, along the following // lines: // // WITH vmm_found AS (SELECT(SELECT id FROM vmm WHERE vmm.id = id) AS id), // vmm_updated AS (UPDATE vmm SET ... 
RETURNING *), -// instance_found AS (SELECT( -// SELECT id FROM instance WHERE instance.id = id +// migration_in_found AS (SELECT( +// SELECT id FROM migration WHERE migration.id = migration_in_id // ) AS id), -// instance_updated AS (UPDATE instance SET ... RETURNING *), +// migration_in_updated AS (UPDATE migration SET ... RETURNING *), +// migration_out_found AS (SELECT( +// SELECT id FROM migration WHERE migration.id = migration_out_id +// ) AS id), +// migration_out_updated AS (UPDATE migration SET ... RETURNING *), // vmm_result AS ( // SELECT vmm_found.id AS found, vmm_updated.id AS updated // FROM vmm_found // LEFT JOIN vmm_updated // ON vmm_found.id = vmm_updated.id // ), -// instance_result AS ( -// SELECT instance_found.id AS found, instance_updated.id AS updated -// FROM instance_found -// LEFT JOIN instance_updated -// ON instance_found.id = instance_updated.id -// ) -// SELECT vmm_result.found, vmm_result.updated, instance_result.found, -// instance_result.updated -// FROM vmm_result, instance_result; -/// -/// If a [`MigrationRuntimeState`] is provided, similar "found" and "update" -/// clauses are also added to join the `migration` record for the instance's -/// active migration, if one exists, and update the migration record. If no -/// migration record is provided, this part of the query is skipped, and the -/// `migration_found` and `migration_updated` portions are always `false`. +// migration_in_result AS ( +// SELECT migration_in_found.id AS found, migration_in_updated.id AS updated +// FROM migration_in_found +// LEFT JOIN migration_in_updated +// ON migration_in_found.id = migration_in_updated.id +// ), +// migration_out_result AS ( .. ) +// SELECT vmm_result.found, vmm_result.updated, migration_in_result.found, +// migration_in_result.updated, migration_out_result.found, +// migration_out_result.updated, +// FROM vmm_result, migration_in_result, migration_out_result; +// +// Depending on whether a migration in, migration out, both, or neither were +// provided, the structure of the query will differ somewhat. // -// The "wrapper" SELECTs when finding instances and VMMs are used to get a NULL +// The "wrapper" SELECTs when finding migrations and VMMs are used to get a NULL // result in the final output instead of failing the entire query if the target // object is missing. This maximizes Nexus's flexibility when dealing with // updates from sled agent that refer to one valid and one deleted object. (This // can happen if, e.g., sled agent sends a message indicating that a retired VMM // has finally been destroyed when its instance has since been deleted.) -pub struct InstanceAndVmmUpdate { +pub struct VmmAndMigrationUpdate { vmm_find: Box + Send>, vmm_update: Box + Send>, - instance: Option, migration_in: Option, migration_out: Option, } @@ -99,12 +98,7 @@ struct Update { /// Contains the result of a combined instance-and-VMM update operation. #[derive(Copy, Clone, PartialEq, Debug)] -pub struct InstanceAndVmmUpdateResult { - /// `Some(status)` if the target instance was found; the wrapped - /// `UpdateStatus` indicates whether the row was updated. `None` if the - /// instance was not found. - pub instance_status: RecordUpdateStatus, - +pub struct VmmAndMigrationUpdateResult { /// `Some(status)` if the target VMM was found; the wrapped `UpdateStatus` /// indicates whether the row was updated. `None` if the VMM was not found. 
pub vmm_status: Option, @@ -171,11 +165,10 @@ where } } -impl InstanceAndVmmUpdate { +impl VmmAndMigrationUpdate { pub fn new( vmm_id: PropolisUuid, new_vmm_runtime_state: VmmRuntimeState, - instance: Option<(InstanceUuid, InstanceRuntimeState)>, Migrations { migration_in, migration_out }: Migrations<'_>, ) -> Self { let vmm_find = Box::new( @@ -192,32 +185,6 @@ impl InstanceAndVmmUpdate { .set(new_vmm_runtime_state), ); - let instance = instance.map(|(instance_id, new_runtime_state)| { - let instance_id = instance_id.into_untyped_uuid(); - let find = Box::new( - instance_dsl::instance - .filter(instance_dsl::id.eq(instance_id)) - .select(instance_dsl::id), - ); - - let update = Box::new( - diesel::update(instance_dsl::instance) - .filter(instance_dsl::time_deleted.is_null()) - .filter(instance_dsl::id.eq(instance_id)) - .filter( - instance_dsl::state_generation - .lt(new_runtime_state.gen), - ) - .set(new_runtime_state), - ); - Update { - find, - update, - name: "instance", - id: instance_dsl::id::NAME, - } - }); - fn migration_find( migration_id: Uuid, ) -> Box + Send> { @@ -293,21 +260,18 @@ impl InstanceAndVmmUpdate { }, ); - Self { vmm_find, vmm_update, instance, migration_in, migration_out } + Self { vmm_find, vmm_update, migration_in, migration_out } } pub async fn execute_and_check( self, conn: &(impl async_bb8_diesel::AsyncConnection + Sync), - ) -> Result { + ) -> Result { let has_migration_in = self.migration_in.is_some(); let has_migration_out = self.migration_out.is_some(); - let has_instance = self.instance.is_some(); let ( vmm_found, vmm_updated, - instance_found, - instance_updated, migration_in_found, migration_in_updated, migration_out_found, @@ -320,22 +284,12 @@ impl InstanceAndVmmUpdate { Option, Option, Option, - Option, - Option, // WHEW! 
)>(conn) .await?; let vmm_status = compute_update_status(vmm_found, vmm_updated); - let instance_status = if has_instance { - compute_update_status(instance_found, instance_updated) - .map(RecordUpdateStatus::Found) - .unwrap_or(RecordUpdateStatus::NotFound) - } else { - RecordUpdateStatus::NotProvided - }; - let migration_in_status = if has_migration_in { compute_update_status(migration_in_found, migration_in_updated) .map(RecordUpdateStatus::Found) @@ -352,8 +306,7 @@ impl InstanceAndVmmUpdate { RecordUpdateStatus::NotProvided }; - Ok(InstanceAndVmmUpdateResult { - instance_status, + Ok(VmmAndMigrationUpdateResult { vmm_status, migration_in_status, migration_out_status, @@ -361,12 +314,12 @@ impl InstanceAndVmmUpdate { } } -impl QueryId for InstanceAndVmmUpdate { +impl QueryId for VmmAndMigrationUpdate { type QueryId = (); const HAS_STATIC_QUERY_ID: bool = false; } -impl Query for InstanceAndVmmUpdate { +impl Query for VmmAndMigrationUpdate { type SqlType = ( Nullable, Nullable, @@ -374,12 +327,10 @@ impl Query for InstanceAndVmmUpdate { Nullable, Nullable, Nullable, - Nullable, - Nullable, ); } -impl RunQueryDsl for InstanceAndVmmUpdate {} +impl RunQueryDsl for VmmAndMigrationUpdate {} impl Update { fn push_subqueries<'b>( @@ -422,13 +373,9 @@ impl Update { } } -impl QueryFragment for InstanceAndVmmUpdate { +impl QueryFragment for VmmAndMigrationUpdate { fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, Pg>) -> QueryResult<()> { out.push_sql("WITH "); - if let Some(ref instance) = self.instance { - instance.push_subqueries(&mut out)?; - out.push_sql(", "); - } if let Some(ref m) = self.migration_in { m.push_subqueries(&mut out)?; @@ -474,17 +421,12 @@ impl QueryFragment for InstanceAndVmmUpdate { } out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - push_select_from_result(self.instance.as_ref(), &mut out); - out.push_sql(", "); push_select_from_result(self.migration_in.as_ref(), &mut out); out.push_sql(", "); push_select_from_result(self.migration_out.as_ref(), &mut out); out.push_sql(" "); out.push_sql("FROM vmm_result"); - if self.instance.is_some() { - out.push_sql(", instance_result"); - } if self.migration_in.is_some() { out.push_sql(", migration_in_result"); } @@ -530,52 +472,19 @@ mod test { } } - fn mk_instance_state() -> (InstanceUuid, InstanceRuntimeState) { - let id = InstanceUuid::nil(); - let state = InstanceRuntimeState { - time_updated: Utc::now(), - gen: Generation::new(), - propolis_id: Some(Uuid::nil()), - dst_propolis_id: Some(Uuid::nil()), - migration_id: Some(Uuid::nil()), - nexus_state: nexus_db_model::InstanceState::Vmm, - }; - (id, state) - } - #[tokio::test] async fn expectorate_query_only_vmm() { let vmm_id = PropolisUuid::nil(); let vmm_state = mk_vmm_state(); - let query = InstanceAndVmmUpdate::new( - vmm_id, - vmm_state, - None, - Migrations::default(), - ); - expectorate_query_contents( - &query, - "tests/output/instance_and_vmm_update_vmm_only.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_and_instance() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let instance = mk_instance_state(); - - let query = InstanceAndVmmUpdate::new( + let query = VmmAndMigrationUpdate::new( vmm_id, vmm_state, - Some(instance), Migrations::default(), ); expectorate_query_contents( &query, - "tests/output/instance_and_vmm_update_vmm_and_instance.sql", + "tests/output/vmm_and_migration_update_vmm_only.sql", ) .await; } @@ -586,35 +495,14 @@ mod test { let vmm_state = mk_vmm_state(); let migration = 
mk_migration_state(); - let query = InstanceAndVmmUpdate::new( - vmm_id, - vmm_state, - None, - Migrations { migration_in: Some(&migration), migration_out: None }, - ); - expectorate_query_contents( - &query, - "tests/output/instance_and_vmm_update_vmm_and_migration_in.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_instance_and_migration_in() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let instance = mk_instance_state(); - let migration = mk_migration_state(); - - let query = InstanceAndVmmUpdate::new( + let query = VmmAndMigrationUpdate::new( vmm_id, vmm_state, - Some(instance), Migrations { migration_in: Some(&migration), migration_out: None }, ); expectorate_query_contents( &query, - "tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql", + "tests/output/vmm_and_migration_update_vmm_and_migration_in.sql", ) .await; } @@ -625,35 +513,14 @@ mod test { let vmm_state = mk_vmm_state(); let migration = mk_migration_state(); - let query = InstanceAndVmmUpdate::new( - vmm_id, - vmm_state, - None, - Migrations { migration_out: Some(&migration), migration_in: None }, - ); - expectorate_query_contents( - &query, - "tests/output/instance_and_vmm_update_vmm_and_migration_out.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_instance_and_migration_out() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let instance = mk_instance_state(); - let migration = mk_migration_state(); - - let query = InstanceAndVmmUpdate::new( + let query = VmmAndMigrationUpdate::new( vmm_id, vmm_state, - Some(instance), Migrations { migration_out: Some(&migration), migration_in: None }, ); expectorate_query_contents( &query, - "tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql", + "tests/output/vmm_and_migration_update_vmm_and_migration_out.sql", ) .await; } @@ -665,34 +532,9 @@ mod test { let migration_in = mk_migration_state(); let migration_out = mk_migration_state(); - let query = InstanceAndVmmUpdate::new( - vmm_id, - vmm_state, - None, - Migrations { - migration_in: Some(&migration_in), - migration_out: Some(&migration_out), - }, - ); - expectorate_query_contents( - &query, - "tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_instance_and_both_migrations() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let instance = mk_instance_state(); - let migration_in = mk_migration_state(); - let migration_out = mk_migration_state(); - - let query = InstanceAndVmmUpdate::new( + let query = VmmAndMigrationUpdate::new( vmm_id, vmm_state, - Some(instance), Migrations { migration_in: Some(&migration_in), migration_out: Some(&migration_out), @@ -700,7 +542,7 @@ mod test { ); expectorate_query_contents( &query, - "tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql", + "tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql", ) .await; } diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql deleted file mode 100644 index 3014e9068fb..00000000000 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_instance.sql +++ /dev/null @@ -1,55 +0,0 @@ -WITH - instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), - instance_updated - AS ( - UPDATE - instance - SET - time_state_updated = $2, - 
state_generation = $3, - active_propolis_id = $4, - target_propolis_id = $5, - migration_id = $6, - state = $7 - WHERE - ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 - RETURNING - id - ), - instance_result - AS ( - SELECT - instance_found.id AS found, instance_updated.id AS updated - FROM - instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id - ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $10) AS id), - vmm_updated - AS ( - UPDATE - vmm - SET - time_state_updated = $11, state_generation = $12, state = $13 - WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $14) AND vmm.state_generation < $15 - RETURNING - id - ), - vmm_result - AS ( - SELECT - vmm_found.id AS found, vmm_updated.id AS updated - FROM - vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id - ) -SELECT - vmm_result.found, - vmm_result.updated, - instance_result.found, - instance_result.updated, - NULL, - NULL, - NULL, - NULL -FROM - vmm_result, instance_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql deleted file mode 100644 index 52c28f85c37..00000000000 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_both_migrations.sql +++ /dev/null @@ -1,119 +0,0 @@ -WITH - instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), - instance_updated - AS ( - UPDATE - instance - SET - time_state_updated = $2, - state_generation = $3, - active_propolis_id = $4, - target_propolis_id = $5, - migration_id = $6, - state = $7 - WHERE - ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 - RETURNING - id - ), - instance_result - AS ( - SELECT - instance_found.id AS found, instance_updated.id AS updated - FROM - instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id - ), - migration_in_found - AS ( - SELECT - ( - SELECT - migration.id - FROM - migration - WHERE - migration.id = $10 AND (migration.time_deleted IS NULL) - ) - AS id - ), - migration_in_updated - AS ( - UPDATE - migration - SET - target_state = $11, time_target_updated = $12, target_gen = $13 - WHERE - (migration.id = $14 AND migration.target_propolis_id = $15) AND migration.target_gen < $16 - RETURNING - id - ), - migration_in_result - AS ( - SELECT - migration_in_found.id AS found, migration_in_updated.id AS updated - FROM - migration_in_found - LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id - ), - migration_out_found - AS ( - SELECT - ( - SELECT - migration.id - FROM - migration - WHERE - migration.id = $17 AND (migration.time_deleted IS NULL) - ) - AS id - ), - migration_out_updated - AS ( - UPDATE - migration - SET - source_state = $18, time_source_updated = $19, source_gen = $20 - WHERE - (migration.id = $21 AND migration.source_propolis_id = $22) AND migration.source_gen < $23 - RETURNING - id - ), - migration_out_result - AS ( - SELECT - migration_out_found.id AS found, migration_out_updated.id AS updated - FROM - migration_out_found - LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id - ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $24) AS id), - vmm_updated - AS ( - UPDATE - vmm - SET - time_state_updated = $25, state_generation = $26, state = $27 - WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $28) 
AND vmm.state_generation < $29 - RETURNING - id - ), - vmm_result - AS ( - SELECT - vmm_found.id AS found, vmm_updated.id AS updated - FROM - vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id - ) -SELECT - vmm_result.found, - vmm_result.updated, - instance_result.found, - instance_result.updated, - migration_in_result.found, - migration_in_result.updated, - migration_out_result.found, - migration_out_result.updated -FROM - vmm_result, instance_result, migration_in_result, migration_out_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql deleted file mode 100644 index e717008617d..00000000000 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_in.sql +++ /dev/null @@ -1,87 +0,0 @@ -WITH - instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), - instance_updated - AS ( - UPDATE - instance - SET - time_state_updated = $2, - state_generation = $3, - active_propolis_id = $4, - target_propolis_id = $5, - migration_id = $6, - state = $7 - WHERE - ((instance.time_deleted IS NULL) AND instance.id = $8) AND instance.state_generation < $9 - RETURNING - id - ), - instance_result - AS ( - SELECT - instance_found.id AS found, instance_updated.id AS updated - FROM - instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id - ), - migration_in_found - AS ( - SELECT - ( - SELECT - migration.id - FROM - migration - WHERE - migration.id = $10 AND (migration.time_deleted IS NULL) - ) - AS id - ), - migration_in_updated - AS ( - UPDATE - migration - SET - target_state = $11, time_target_updated = $12, target_gen = $13 - WHERE - (migration.id = $14 AND migration.target_propolis_id = $15) AND migration.target_gen < $16 - RETURNING - id - ), - migration_in_result - AS ( - SELECT - migration_in_found.id AS found, migration_in_updated.id AS updated - FROM - migration_in_found - LEFT JOIN migration_in_updated ON migration_in_found.id = migration_in_updated.id - ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $17) AS id), - vmm_updated - AS ( - UPDATE - vmm - SET - time_state_updated = $18, state_generation = $19, state = $20 - WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $21) AND vmm.state_generation < $22 - RETURNING - id - ), - vmm_result - AS ( - SELECT - vmm_found.id AS found, vmm_updated.id AS updated - FROM - vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id - ) -SELECT - vmm_result.found, - vmm_result.updated, - instance_result.found, - instance_result.updated, - migration_in_result.found, - migration_in_result.updated, - NULL, - NULL -FROM - vmm_result, instance_result, migration_in_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql b/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql deleted file mode 100644 index c02b73e4f60..00000000000 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_instance_and_migration_out.sql +++ /dev/null @@ -1,87 +0,0 @@ -WITH - instance_found AS (SELECT (SELECT instance.id FROM instance WHERE instance.id = $1) AS id), - instance_updated - AS ( - UPDATE - instance - SET - time_state_updated = $2, - state_generation = $3, - active_propolis_id = $4, - target_propolis_id = $5, - migration_id = $6, - state = $7 - WHERE - ((instance.time_deleted IS NULL) AND instance.id = $8) AND 
instance.state_generation < $9 - RETURNING - id - ), - instance_result - AS ( - SELECT - instance_found.id AS found, instance_updated.id AS updated - FROM - instance_found LEFT JOIN instance_updated ON instance_found.id = instance_updated.id - ), - migration_out_found - AS ( - SELECT - ( - SELECT - migration.id - FROM - migration - WHERE - migration.id = $10 AND (migration.time_deleted IS NULL) - ) - AS id - ), - migration_out_updated - AS ( - UPDATE - migration - SET - source_state = $11, time_source_updated = $12, source_gen = $13 - WHERE - (migration.id = $14 AND migration.source_propolis_id = $15) AND migration.source_gen < $16 - RETURNING - id - ), - migration_out_result - AS ( - SELECT - migration_out_found.id AS found, migration_out_updated.id AS updated - FROM - migration_out_found - LEFT JOIN migration_out_updated ON migration_out_found.id = migration_out_updated.id - ), - vmm_found AS (SELECT (SELECT vmm.id FROM vmm WHERE vmm.id = $17) AS id), - vmm_updated - AS ( - UPDATE - vmm - SET - time_state_updated = $18, state_generation = $19, state = $20 - WHERE - ((vmm.time_deleted IS NULL) AND vmm.id = $21) AND vmm.state_generation < $22 - RETURNING - id - ), - vmm_result - AS ( - SELECT - vmm_found.id AS found, vmm_updated.id AS updated - FROM - vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id - ) -SELECT - vmm_result.found, - vmm_result.updated, - instance_result.found, - instance_result.updated, - NULL, - NULL, - migration_out_result.found, - migration_out_result.updated -FROM - vmm_result, instance_result, migration_out_result diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql similarity index 99% rename from nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql rename to nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql index 354fc9a4035..bb460ff7137 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_both_migrations.sql +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql @@ -85,8 +85,6 @@ WITH SELECT vmm_result.found, vmm_result.updated, - NULL, - NULL, migration_in_result.found, migration_in_result.updated, migration_out_result.found, diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_in.sql similarity index 98% rename from nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql rename to nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_in.sql index 870cce4c02b..3fec792c6f6 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_in.sql +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_in.sql @@ -53,8 +53,6 @@ WITH SELECT vmm_result.found, vmm_result.updated, - NULL, - NULL, migration_in_result.found, migration_in_result.updated, NULL, diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_out.sql similarity index 98% rename from nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql rename to nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_out.sql index 4dea3779f7b..7adeff48da0 100644 --- 
a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_and_migration_out.sql +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_and_migration_out.sql @@ -55,8 +55,6 @@ SELECT vmm_result.updated, NULL, NULL, - NULL, - NULL, migration_out_result.found, migration_out_result.updated FROM diff --git a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_only.sql similarity index 87% rename from nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql rename to nexus/db-queries/tests/output/vmm_and_migration_update_vmm_only.sql index 8f81e662a96..cfe56740fe7 100644 --- a/nexus/db-queries/tests/output/instance_and_vmm_update_vmm_only.sql +++ b/nexus/db-queries/tests/output/vmm_and_migration_update_vmm_only.sql @@ -19,6 +19,6 @@ WITH vmm_found LEFT JOIN vmm_updated ON vmm_found.id = vmm_updated.id ) SELECT - vmm_result.found, vmm_result.updated, NULL, NULL, NULL, NULL, NULL, NULL + vmm_result.found, vmm_result.updated, NULL, NULL, NULL, NULL FROM vmm_result diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index d93e7733048..8b44290d953 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -258,8 +258,8 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::db::datastore::instance; -use crate::app::db::datastore::instance::InstanceUpdateResult; use crate::app::db::datastore::InstanceSnapshot; +use crate::app::db::datastore::VmmStateUpdateResult; use crate::app::db::lookup::LookupPath; use crate::app::db::model::ByteCount; use crate::app::db::model::Generation; @@ -302,7 +302,7 @@ mod destroyed; /// Returns `true` if an `instance-update` saga should be executed as a result /// of writing the provided [`SledInstanceState`] to the database with the -/// provided [`InstanceUpdateResult`]. +/// provided [`VmmStateUpdateResult`]. /// /// We determine this only after actually updating the database records, /// because we don't know whether a particular VMM or migration state is @@ -322,7 +322,7 @@ pub fn update_saga_needed( log: &slog::Logger, instance_id: InstanceUuid, state: &SledInstanceState, - result: &InstanceUpdateResult, + result: &VmmStateUpdateResult, ) -> bool { // Currently, an instance-update saga is required if (and only if): // From b9f77116b75b4fea27ec2c9d2964888fc53735de Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 25 Jul 2024 17:28:33 -0700 Subject: [PATCH 185/234] separate "commit updates and unlock" and "just unlock now please" operations --- nexus/db-queries/src/db/datastore/instance.rs | 121 +++++++++++++++--- nexus/src/app/sagas/instance_update/mod.rs | 6 +- nexus/src/app/sagas/instance_update/start.rs | 2 +- 3 files changed, 108 insertions(+), 21 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index e61c2fdca88..0f403c02917 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1082,8 +1082,92 @@ impl DataStore { } } - /// Release the instance-updater lock acquired by - /// [`DataStore::instance_updater_lock`]. + /// Release the instance-updater lock on this instance, if (and only if) the + /// lock is currently held by the saga represented by the provided + /// [`UpdaterLock`] token. 
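A toy model of the "only the holder may unlock" rule described in the doc comment above, using hypothetical in-memory types instead of the real instance table. Releasing the lock bumps the updater generation, which is what makes a repeated release attempt a harmless no-op; the real datastore method distinguishes further cases (for example, it errors when a different saga attempts the release), which this sketch collapses to a bool for brevity.

// Hypothetical stand-in for the lock-related columns on the instance row.
#[derive(Debug)]
struct LockColumns {
    updater_id: Option<u64>,
    updater_gen: u64,
}

// Release the lock only if it is held by `updater_id` at `locked_gen`.
// Returns true if this call released the lock, false if the lock has already
// been released (the generation has moved on) or is held by someone else.
fn unlock(row: &mut LockColumns, updater_id: u64, locked_gen: u64) -> bool {
    if row.updater_id == Some(updater_id) && row.updater_gen == locked_gen {
        row.updater_id = None;
        row.updater_gen += 1;
        true
    } else {
        false
    }
}

fn main() {
    let mut row = LockColumns { updater_id: Some(42), updater_gen: 2 };
    assert!(unlock(&mut row, 42, 2)); // released by the holder
    assert!(!unlock(&mut row, 42, 2)); // second attempt is an idempotent no-op
    assert!(!unlock(&mut row, 7, 3)); // a non-holder can never release it
}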
+ pub async fn instance_updater_unlock( + &self, + opctx: &OpContext, + authz_instance: &authz::Instance, + lock: &UpdaterLock, + ) -> Result { + use db::schema::instance::dsl; + + let instance_id = authz_instance.id(); + let UpdaterLock { updater_id, locked_gen } = *lock; + + let result = diesel::update(dsl::instance) + // N.B. that we intentionally *don't* filter out instances that have + // been deleted. If the instance doesn't exist, whatever. It is, by + // definition, "unlocked"... :) + .filter(dsl::id.eq(instance_id)) + // Only unlock the instance if: + // - the provided updater ID matches that of the saga that has + // currently locked this instance. + .filter(dsl::updater_id.eq(Some(updater_id))) + // - the provided updater generation matches the current updater + // generation. + .filter(dsl::updater_gen.eq(locked_gen)) + .set(( + dsl::updater_gen.eq(Generation(locked_gen.0.next())), + dsl::updater_id.eq(None::), + )) + .check_if_exists::(instance_id) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Instance, + LookupType::ById(instance_id), + ), + ) + })?; + + match result { + // If we updated the record, the lock has been released! Return + // `Ok(true)` to indicate that we released the lock successfully. + UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { + Ok(true) + } + // The generation has advanced past the generation at which the + // lock was held. This means that we have already released the + // lock. Return `Ok(false)` here for idempotency. + UpdateAndQueryResult { + status: UpdateStatus::NotUpdatedButExists, + ref found, + } if found.updater_gen > locked_gen => Ok(false), + // The instance exists, but the lock ID doesn't match our lock ID. + // This means we were trying to release a lock we never held, whcih + // is almost certainly a programmer error. + UpdateAndQueryResult { ref found, .. } => { + match found.updater_id { + Some(actual_id) if actual_id != updater_id => { + slog::error!( + &opctx.log, + "attempted to release a lock held by another saga"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => %actual_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error( + "attempted to release a lock held by another saga! this is a bug!", + )) + }, + Some(_) => Ok(false), + None => Err(Error::internal_error( + "attempted to release a lock on an instance that is not locked! this is a bug!", + )), + } + } + } + } + + /// Write the provided `new_runtime_state` for this instance, and release + /// the provided `lock`. /// /// This method will unlock the instance if (and only if) the lock is /// currently held by the provided `updater_id`. If the lock is held by a @@ -1095,15 +1179,15 @@ impl DataStore { /// - `authz_instance`: the instance to attempt to unlock /// - `updater_lock`: an [`UpdaterLock`] token representing the acquired /// lock to release. - /// - `new_runtime`: an optional [`InstanceRuntimeState`] to write + /// - `new_runtime`: an [`InstanceRuntimeState`] to write /// back to the database when the lock is released. If this is [`None`], /// the instance's runtime state will not be modified. 
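Extending the sketch above: the commit-and-unlock variant that this patch splits out additionally gates the write on the instance's state generation, so even the lock holder cannot commit a stale `new_runtime`. This mirrors how the saga's forward path commits its update while the unwind path only releases the lock. Again a hypothetical model, not the real query.

// Hypothetical stand-in for the instance columns touched by commit-and-unlock.
#[derive(Debug)]
struct InstanceRow {
    state: &'static str,
    state_gen: u64,
    updater_id: Option<u64>,
    updater_gen: u64,
}

// Write `new_state` at `new_gen` and release the lock, but only if the caller
// still holds the lock and `new_gen` is newer than the current state.
fn commit_update(
    row: &mut InstanceRow,
    updater_id: u64,
    locked_gen: u64,
    new_state: &'static str,
    new_gen: u64,
) -> bool {
    let holds_lock =
        row.updater_id == Some(updater_id) && row.updater_gen == locked_gen;
    if holds_lock && new_gen > row.state_gen {
        row.state = new_state;
        row.state_gen = new_gen;
        row.updater_id = None;
        row.updater_gen += 1;
        return true;
    }
    false
}

fn main() {
    let mut row = InstanceRow {
        state: "vmm",
        state_gen: 2,
        updater_id: Some(42),
        updater_gen: 3,
    };
    assert!(commit_update(&mut row, 42, 3, "no_vmm", 3)); // applied and unlocked
    assert!(!commit_update(&mut row, 42, 3, "no_vmm", 4)); // lock already gone
}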
- pub async fn instance_updater_unlock( + pub async fn instance_commit_update( &self, opctx: &OpContext, authz_instance: &authz::Instance, lock: &UpdaterLock, - new_runtime: Option<&InstanceRuntimeState>, + new_runtime: &InstanceRuntimeState, ) -> Result { use db::schema::instance::dsl; @@ -1120,10 +1204,11 @@ impl DataStore { // - the provided updater generation matches the current updater // generation. .filter(dsl::updater_gen.eq(locked_gen)) + .filter(dsl::state_generation.lt(new_runtime.gen)) .set(( dsl::updater_gen.eq(Generation(locked_gen.0.next())), dsl::updater_id.eq(None::), - new_runtime.cloned(), + new_runtime.clone(), )) .check_if_exists::(instance_id) .execute_and_check(&*self.pool_connection_authorized(opctx).await?) @@ -1156,7 +1241,7 @@ impl DataStore { // is almost certainly a programmer error. UpdateAndQueryResult { ref found, .. } => { match found.updater_id { - Some(actual_id) => { + Some(actual_id) if actual_id != updater_id => { slog::error!( &opctx.log, "attempted to release a lock held by another saga"; @@ -1166,14 +1251,16 @@ impl DataStore { "found_gen" => ?found.updater_gen, "locked_gen" => ?locked_gen, ); - debug_assert_ne!(actual_id, updater_id); Err(Error::internal_error( "attempted to release a lock held by another saga! this is a bug!", )) }, + Some(_) => Err(Error::internal_error( + "attempted to commit an instance update, but the state generation has advanced!" + )), None => Err(Error::internal_error( - "attempted to release a lock on an instance that is not locked! this is a bug!", - )), + "attempted to release a lock on an instance that is not locked! this is a bug!", + )), } } } @@ -1307,7 +1394,7 @@ mod tests { // unlock the instance from saga 1 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, &lock1, None) + .instance_commit_update(&opctx, &authz_instance, &lock1, None) .await .expect("instance must be unlocked by saga 1"); assert!(unlocked, "instance must actually be unlocked"); @@ -1320,7 +1407,7 @@ mod tests { // unlock the instance from saga 2 let unlocked = datastore - .instance_updater_unlock(&opctx, &authz_instance, &lock2, None) + .instance_commit_update(&opctx, &authz_instance, &lock2, None) .await .expect("instance must be unlocked by saga 2"); assert!(unlocked, "instance must actually be unlocked"); @@ -1366,7 +1453,7 @@ mod tests { // now, unlock the instance. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, &lock1, None) + .instance_commit_update(&opctx, &authz_instance, &lock1, None) .await ) .expect("instance should unlock"); @@ -1375,7 +1462,7 @@ mod tests { // unlocking it again should also succeed... let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, &lock2, None) + .instance_commit_update(&opctx, &authz_instance, &lock2, None) .await ) .expect("instance should unlock again"); @@ -1410,7 +1497,7 @@ mod tests { // attempting to unlock with a different saga ID should be an error. let err = dbg!( datastore - .instance_updater_unlock( + .instance_commit_update( &opctx, &authz_instance, // N.B. that the `UpdaterLock` type's fields are private @@ -1441,7 +1528,7 @@ mod tests { // unlocking with the correct ID should succeed. let unlocked = dbg!( datastore - .instance_updater_unlock(&opctx, &authz_instance, &lock1, None) + .instance_commit_update(&opctx, &authz_instance, &lock1, None) .await ) .expect("instance should unlock"); @@ -1451,7 +1538,7 @@ mod tests { // (where the lock is no longer held) should fail. 
let err = dbg!( datastore - .instance_updater_unlock( + .instance_commit_update( &opctx, &authz_instance, // Again, these fields are private specifically to prevent diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 8b44290d953..61060689ed6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -989,11 +989,11 @@ async fn siu_commit_instance_updates( let did_unlock = osagactx .datastore() - .instance_updater_unlock( + .instance_commit_update( &opctx, &authz_instance, &lock, - Some(&update.new_runtime), + &update.new_runtime, ) .await .map_err(ActionError::action_failed)?; @@ -1154,7 +1154,7 @@ async fn unwind_instance_lock( || { osagactx .datastore() - .instance_updater_unlock(&opctx, authz_instance, &lock, None) + .instance_updater_unlock(&opctx, authz_instance, &lock) .or_else(|err| future::ready(match err { // The instance record was not found. It's probably been // deleted. That's fine, we can now die happily, since we won't diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index db8c88bf153..3350812a0c8 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -211,7 +211,7 @@ async fn siu_fetch_state_and_start_real_saga( "current.target_vmm" => ?state.target_vmm, ); datastore - .instance_updater_unlock(&opctx, &authz_instance, &orig_lock, None) + .instance_updater_unlock(&opctx, &authz_instance, &orig_lock) .await .map_err(ActionError::action_failed)?; } From 377bda174691cbf021827b557571b47e873fed91 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 26 Jul 2024 08:33:21 -0700 Subject: [PATCH 186/234] fixup tests --- 83e2f8ab-86dc-491f-98d9-b66a16768ddb.log | 565 ++++++++++++++++++ nexus/db-queries/src/db/datastore/instance.rs | 12 +- 2 files changed, 570 insertions(+), 7 deletions(-) create mode 100644 83e2f8ab-86dc-491f-98d9-b66a16768ddb.log diff --git a/83e2f8ab-86dc-491f-98d9-b66a16768ddb.log b/83e2f8ab-86dc-491f-98d9-b66a16768ddb.log new file mode 100644 index 00000000000..7324b45faa5 --- /dev/null +++ b/83e2f8ab-86dc-491f-98d9-b66a16768ddb.log @@ -0,0 +1,565 @@ +root@BRM44220001:~# looker -f $( /opt/oxide/oxlog/oxlog logs oxz_nexus_3090570f-4c2b-43ae-8124-776fbad100fa --current ) -c 'r.instance_id?.contains("83e2f8ab-86dc-491f-98d9-b66a16768ddb")' +21:25:29.253Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c + remote_addr = [fd00:1122:3344:103::1]:54972 + req_id = e81e031e-a27d-4e06-b5c0-2481f5eea1a4 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(2), time_updated: 2024-07-25T21:25:29.252818945Z } +21:25:29.257Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): new VMM runtime state from sled agent requires an instance-update saga + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_in_needs_update = false + 
migration_out_needs_update = false + propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c + remote_addr = [fd00:1122:3344:103::1]:54972 + req_id = e81e031e-a27d-4e06-b5c0-2481f5eea1a4 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_needs_update = true +21:25:29.273Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): starting update saga for 83e2f8ab-86dc-491f-98d9-b66a16768ddb + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1371 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + remote_addr = [fd00:1122:3344:103::1]:54972 + req_id = e81e031e-a27d-4e06-b5c0-2481f5eea1a4 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(2), time_updated: 2024-07-25T21:25:29.252818945Z } +21:25:29.351Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: attempting to lock instance + file = nexus/src/app/sagas/instance_update/start.rs:92 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a + saga_name = start-instance-update +21:25:29.365Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + current_gen = Generation(Generation(1)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a +21:25:29.370Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + current_gen = Generation(Generation(1)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a +21:25:29.374Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance updater lock acquired! + actor_id = 001de000-05e4-4000-8000-000000000002 + already_locked = true + authenticated = true + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(2)) + saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a +21:25:29.399Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: starting real update saga... 
+ current.active_vmm = Some(Vmm { id: 18a38b46-c2b3-45f4-8a77-3fbf43d2175c, time_created: 2024-07-25T21:25:26.371086Z, time_deleted: None, instance_id: 83e2f8ab-86dc-491f-98d9-b66a16768ddb, sled_id: 7c8b0a7f-23e1-4b88-8519-eb43ed065667, propolis_ip: V6(Ipv6Network { addr: fd00:1122:3344:103::1:34, prefix: 128 }), propolis_port: SqlU16(12400), runtime: VmmRuntimeState { time_state_updated: 2024-07-25T21:25:29.252818Z, gen: Generation(Generation(2)), state: Destroyed } }) + current.migration = None + current.runtime_state = InstanceRuntimeState { time_updated: 2024-07-25T21:25:26.884469Z, gen: Generation(Generation(2)), propolis_id: Some(18a38b46-c2b3-45f4-8a77-3fbf43d2175c), dst_propolis_id: None, migration_id: None, nexus_state: Vmm } + current.target_vmm = None + file = nexus/src/app/sagas/instance_update/start.rs:178 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 + saga_name = start-instance-update + update.deprovision = true + update.destroy_active_vmm = Some(18a38b46-c2b3-45f4-8a77-3fbf43d2175c (propolis)) + update.destroy_target_vmm = None + update.network_config_update = Some(Delete) + update.new_runtime_state = InstanceRuntimeState { time_updated: 2024-07-25T21:25:29.398999367Z, gen: Generation(Generation(3)), propolis_id: None, dst_propolis_id: None, migration_id: None, nexus_state: NoVmm } +21:25:29.420Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: trying to become instance updater... + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + parent_lock = UpdaterLock { updater_id: c5a4e62e-ff9f-4edb-aa68-9a033893a62a, locked_gen: Generation(Generation(2)) } + saga_id = 41b9eb70-84a4-4397-ba86-be068cbffec7 + saga_name = instance-update +21:25:29.427Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): inherited lock from c5a4e62e-ff9f-4edb-aa68-9a033893a62a to 41b9eb70-84a4-4397-ba86-be068cbffec7 + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1051 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(3)) + parent_gen = Generation(Generation(2)) + parent_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update + saga_node = BecomeUpdater + updater_id = 41b9eb70-84a4-4397-ba86-be068cbffec7 +21:25:29.427Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance_update: Now, I am become Updater, the destroyer of VMMs. 
+ file = nexus/src/app/sagas/instance_update/mod.rs:796 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 41b9eb70-84a4-4397-ba86-be068cbffec7 + saga_name = instance-update +21:25:29.438Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: deleting network config + file = nexus/src/app/sagas/instance_update/mod.rs:839 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:29.438Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): deleting instance dpd configuration + file = nexus/src/app/instance_network.rs:548 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb +21:25:29.453Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): notifying dendrite of updates + instance_id = Some(83e2f8ab-86dc-491f-98d9-b66a16768ddb (instance)) + switch = switch1 +21:25:29.607Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): notifying dendrite of updates + instance_id = Some(83e2f8ab-86dc-491f-98d9-b66a16768ddb (instance)) + switch = switch0 +21:25:29.784Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (no VMM): deallocated virtual provisioning resources + file = nexus/src/app/sagas/instance_update/mod.rs:918 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + records_deleted = [VirtualProvisioningCollection { id: 001de000-1334-4000-8000-000000000000, time_modified: Some(2024-07-25T21:25:29.775985Z), collection_type: "Fleet", virtual_disk_bytes_provisioned: ByteCount(ByteCount(11811160064)), cpus_provisioned: 10, ram_provisioned: ByteCount(ByteCount(10737418240)) }, VirtualProvisioningCollection { id: 64900d6e-0c92-4d8c-a035-25c1c179125f, time_modified: Some(2024-07-25T21:25:29.775985Z), collection_type: "Silo", virtual_disk_bytes_provisioned: ByteCount(ByteCount(11811160064)), cpus_provisioned: 10, ram_provisioned: ByteCount(ByteCount(10737418240)) }, VirtualProvisioningCollection { id: 73a1ed16-acc9-4913-b82e-108105e6bed2, time_modified: Some(2024-07-25T21:25:29.775985Z), collection_type: "Project", virtual_disk_bytes_provisioned: ByteCount(ByteCount(11811160064)), cpus_provisioned: 10, ram_provisioned: ByteCount(ByteCount(10737418240)) }] + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:29.791Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (no VMM): unassigning oximeter producer + file = nexus/src/app/sagas/instance_update/mod.rs:953 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:29.798Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: committing new runtime state and unlocking... + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 41b9eb70-84a4-4397-ba86-be068cbffec7, locked_gen: Generation(Generation(3)) } + new_runtime = InstanceRuntimeState { time_updated: 2024-07-25T21:25:29.398999367Z, gen: Generation(Generation(3)), propolis_id: None, dst_propolis_id: None, migration_id: None, nexus_state: NoVmm } + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:29.834Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: committed update new runtime state! 
+ did_unlock = true + file = nexus/src/app/sagas/instance_update/mod.rs:1001 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + new_runtime = InstanceRuntimeState { time_updated: 2024-07-25T21:25:29.398999367Z, gen: Generation(Generation(3)), propolis_id: None, dst_propolis_id: None, migration_id: None, nexus_state: NoVmm } + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:29.863Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (VMM destroyed): deallocating sled resource reservation + file = nexus/src/app/sagas/instance_update/destroyed.rs:83 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:29.872Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (VMM destroyed): marking VMM record deleted + file = nexus/src/app/sagas/instance_update/destroyed.rs:114 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c + saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad + saga_name = instance-update +21:25:42.274Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = b29d225b-7eb5-40c9-9055-e8011a1bce4e + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Starting, gen: Generation(2), time_updated: 2024-07-25T21:25:42.273996740Z } +21:25:42.279Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = bbd804e9-76e0-40d9-838b-d710c45cc978 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:25:55.258Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state + background_task = instance_watcher + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + state = Running +21:25:55.258Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent + background_task = instance_watcher + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:26:25.342Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state + background_task = instance_watcher + instance_id = 
83e2f8ab-86dc-491f-98d9-b66a16768ddb + state = Running +21:26:25.342Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent + background_task = instance_watcher + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:26:55.245Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state + background_task = instance_watcher + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + state = Running +21:26:55.245Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent + background_task = instance_watcher + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:27:25.806Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state + background_task = instance_watcher + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + state = Running +21:27:25.806Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent + background_task = instance_watcher + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:27:55.254Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state + background_task = instance_watcher + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + state = Running +21:27:55.254Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent + background_task = instance_watcher + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:28:25.653Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state + background_task = instance_watcher + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + state = Running +21:28:25.653Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent + background_task = instance_watcher + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } +21:28:27.578Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file 
= nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = c2913dd1-4210-43be-986f-f0f4ac400678 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Stopping, gen: Generation(5), time_updated: 2024-07-25T21:28:27.577646756Z } +21:28:30.621Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = cd2f9516-79a4-4dc4-9dc3-f1d74a9df395 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(7), time_updated: 2024-07-25T21:28:30.620811030Z } +21:28:30.635Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): new VMM runtime state from sled agent requires an instance-update saga + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_in_needs_update = false + migration_out_needs_update = false + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = cd2f9516-79a4-4dc4-9dc3-f1d74a9df395 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_needs_update = true +21:28:30.651Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): starting update saga for 83e2f8ab-86dc-491f-98d9-b66a16768ddb + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1371 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = cd2f9516-79a4-4dc4-9dc3-f1d74a9df395 + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(7), time_updated: 2024-07-25T21:28:30.620811030Z } +21:28:30.652Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + file = nexus/src/app/instance.rs:1869 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + local_addr = [fd00:1122:3344:102::4]:12221 + method = PUT + migration_state = Migrations { migration_in: None, migration_out: None } + propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c + remote_addr = [fd00:1122:3344:101::1]:43502 + req_id = 5e7ceff7-3899-4054-a9b5-e580da62b6cc + uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb + vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(7), time_updated: 2024-07-25T21:28:30.620811030Z } +21:28:30.701Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: attempting to lock instance + file = nexus/src/app/sagas/instance_update/start.rs:92 + 
instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + saga_name = start-instance-update +21:28:30.715Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + current_gen = Generation(Generation(4)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:30.721Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock + actor_id = 001de000-05e4-4000-8000-000000000002 + authenticated = true + current_gen = Generation(Generation(4)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:30.724Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance updater lock acquired! + actor_id = 001de000-05e4-4000-8000-000000000002 + already_locked = true + authenticated = true + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:30.765Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: unlocking instance on unwind + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update +21:28:30.768Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:30.768Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 0 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
+ file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 3.72443ms +21:28:31.082Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:31.082Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 1 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! + file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 316.89294ms +21:28:31.634Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:31.635Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 2 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! + file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 869.80914ms +21:28:32.994Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:32.994Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 3 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
+ file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 2.228850544s +21:28:35.581Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:35.581Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 4 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! + file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 4.816258375s +21:28:41.493Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:41.493Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 5 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! + file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 10.728196913s +21:28:47.821Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:28:47.821Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying + call_count = 6 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
+ file = nexus/src/app/sagas/instance_update/mod.rs:1202 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 17.055072611s +21:29:01.460Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:29:01.460Z WARN 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: server error while unlocking instance, + retrying + call_count = 7 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! + file = nexus/src/app/sagas/instance_update/mod.rs:1191 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 30.693835165s +21:29:17.617Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:29:17.617Z WARN 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: server error while unlocking instance, + retrying + call_count = 8 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
+ file = nexus/src/app/sagas/instance_update/mod.rs:1191 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 46.850057263s +21:30:35.990Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga + actor_id = 001de000-05e4-4000-8000-000000000002 + actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab + authenticated = true + file = nexus/db-queries/src/db/datastore/instance.rs:1160 + found_gen = Generation(Generation(5)) + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + locked_gen = Generation(Generation(5)) + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + saga_node = LockInstance + updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab +21:30:35.990Z WARN 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: server error while unlocking instance, + retrying + call_count = 9 + error = Internal Error: attempted to release a lock held by another saga! this is a bug! + file = nexus/src/app/sagas/instance_update/mod.rs:1191 + instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb + lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } + saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 + saga_name = start-instance-update + total_duration = 125.218673206s diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 0f403c02917..a3625a637c3 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1453,7 +1453,7 @@ mod tests { // now, unlock the instance. let unlocked = dbg!( datastore - .instance_commit_update(&opctx, &authz_instance, &lock1, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock1) .await ) .expect("instance should unlock"); @@ -1462,7 +1462,7 @@ mod tests { // unlocking it again should also succeed... let unlocked = dbg!( datastore - .instance_commit_update(&opctx, &authz_instance, &lock2, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock2,) .await ) .expect("instance should unlock again"); @@ -1497,7 +1497,7 @@ mod tests { // attempting to unlock with a different saga ID should be an error. let err = dbg!( datastore - .instance_commit_update( + .instance_updater_unlock( &opctx, &authz_instance, // N.B. that the `UpdaterLock` type's fields are private @@ -1509,7 +1509,6 @@ mod tests { updater_id: saga2, locked_gen: lock1.locked_gen, }, - None, ) .await ) @@ -1528,7 +1527,7 @@ mod tests { // unlocking with the correct ID should succeed. let unlocked = dbg!( datastore - .instance_commit_update(&opctx, &authz_instance, &lock1, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock1) .await ) .expect("instance should unlock"); @@ -1538,14 +1537,13 @@ mod tests { // (where the lock is no longer held) should fail. let err = dbg!( datastore - .instance_commit_update( + .instance_updater_unlock( &opctx, &authz_instance, // Again, these fields are private specifically to prevent // you from doing this exact thing. But, we should still // test that we handle it gracefully. 
&UpdaterLock { updater_id: saga1, locked_gen: next_gen }, - None, ) .await ) From 6f74ced9baeca61c07701baa66760ce34672b09e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 26 Jul 2024 09:53:56 -0700 Subject: [PATCH 187/234] fixup tests, add test for unlocking a deleted instance --- nexus/db-queries/src/db/datastore/instance.rs | 78 ++++++++++++++++++- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs index a3625a637c3..a82bcafdce7 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1138,6 +1138,7 @@ impl DataStore { status: UpdateStatus::NotUpdatedButExists, ref found, } if found.updater_gen > locked_gen => Ok(false), + // The instance exists, but the lock ID doesn't match our lock ID. // This means we were trying to release a lock we never held, which // is almost certainly a programmer error. @@ -1229,6 +1230,24 @@ impl DataStore { UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { Ok(true) } + + // The instance has been marked as deleted, so no updates were + // committed! + UpdateAndQueryResult { + status: UpdateStatus::NotUpdatedButExists, + ref found, + } if found.time_deleted().is_some() => { + warn!( + &opctx.log, + "cannot commit instance update, as the instance no longer exists"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "time_deleted" => ?found.time_deleted() + ); + + Err(LookupType::ById(instance_id).into_not_found(ResourceType::Instance)) + } + // The generation has advanced past the generation at which the // lock was held. This means that we have already released the // lock. Return `Ok(false)` here for idempotency. @@ -1236,6 +1255,7 @@ impl DataStore { status: UpdateStatus::NotUpdatedButExists, ref found, } if found.updater_gen > locked_gen => Ok(false), + // The instance exists, but the lock ID doesn't match our lock ID. // This means we were trying to release a lock we never held, which // is almost certainly a programmer error. @@ -1255,7 +1275,7 @@ impl DataStore { "attempted to release a lock held by another saga! this is a bug!", )) }, - Some(_) => Err(Error::internal_error( + Some(_) => Err(Error::conflict( "attempted to commit an instance update, but the state generation has advanced!"
)), None => Err(Error::internal_error( @@ -1278,6 +1298,7 @@ mod tests { use nexus_db_model::VmmState; use nexus_test_utils::db::test_setup_database; use nexus_types::external_api::params; + use omicron_common::api::external; use omicron_common::api::external::ByteCount; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_test_utils::dev; @@ -1394,7 +1415,7 @@ mod tests { // unlock the instance from saga 1 let unlocked = datastore - .instance_commit_update(&opctx, &authz_instance, &lock1, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock1) .await .expect("instance must be unlocked by saga 1"); assert!(unlocked, "instance must actually be unlocked"); @@ -1407,7 +1428,7 @@ mod tests { // unlock the instance from saga 2 let unlocked = datastore - .instance_commit_update(&opctx, &authz_instance, &lock2, None) + .instance_updater_unlock(&opctx, &authz_instance, &lock2) .await .expect("instance must be unlocked by saga 2"); assert!(unlocked, "instance must actually be unlocked"); @@ -1563,6 +1584,57 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_unlocking_a_deleted_instance_is_okay() { + // Setup + let logctx = + dev::test_setup_log("test_unlocking_a_deleted_instance_is_okay"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + let saga1 = Uuid::new_v4(); + + // put the instance in a state where it will be okay to delete later... + datastore + .instance_update_runtime( + &InstanceUuid::from_untyped_uuid(authz_instance.id()), + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + .expect("should update state successfully"); + + // lock the instance once. + let lock = dbg!( + datastore + .instance_updater_lock(&opctx, &authz_instance, saga1) + .await + ) + .expect("instance should be locked"); + + // mark the instance as deleted + dbg!(datastore.project_delete_instance(&opctx, &authz_instance).await) + .expect("instance should be deleted"); + + // unlocking should still succeed. + dbg!( + datastore + .instance_updater_unlock(&opctx, &authz_instance, &lock) + .await + ) + .expect("instance should unlock"); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_instance_fetch_all() { // Setup From 836ea7d35b41aebf19628ed850568caf1d90a26c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 26 Jul 2024 14:21:28 -0700 Subject: [PATCH 188/234] activate network RPWs when they're likely to see new state --- nexus/src/app/sagas/instance_update/mod.rs | 25 +++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 61060689ed6..8a26db7c52d 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -873,10 +873,6 @@ async fn siu_update_network_config( } } - // Make sure the V2P manager background task runs to ensure the V2P mappings - // for this instance are up to date. 
- nexus.background_tasks.activate(&nexus.background_tasks.task_v2p_manager); - Ok(()) } @@ -976,6 +972,7 @@ async fn siu_commit_instance_updates( let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); let log = osagactx.log(); + let nexus = osagactx.nexus(); let instance_id = authz_instance.id(); @@ -1006,6 +1003,24 @@ async fn siu_commit_instance_updates( "did_unlock" => ?did_unlock, ); + if update.network_config.is_some() { + // If the update we performed changed networking configuration, activate + // the V2P manager and VPC router RPWs, to ensure that the V2P mapping + // and VPC for this instance are up to date. + // + // We do this here, rather than in the network config update action, so + // that the instance's state in the database reflects the new rather + // than the old state. Otherwise, if the networking RPW ran *before* + // writing the new state to CRDB, it will run with the old VMM, rather + // than the new one, and probably do nothing. Then, the networking + // config update would be delayed until the *next* background task + // activation. This way, we ensure that the RPW runs *after* we are in + // the new state. + + nexus.background_tasks.task_v2p_manager.activate(); + nexus.vpc_needed_notify_sleds(); + } + // Check if the VMM or migration state has changed while the update saga was // running and whether an additional update saga is now required. If one is // required, try to start it. @@ -1030,7 +1045,7 @@ async fn siu_commit_instance_updates( "instance_id" => %instance_id, "error" => %error, ); - osagactx.nexus().background_tasks.task_instance_updater.activate(); + nexus.background_tasks.task_instance_updater.activate(); } Ok(()) From 02085267cfd0a677c2e258371d577426c01ced9d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 27 Jul 2024 13:11:26 -0700 Subject: [PATCH 189/234] post-rebase fixup (PutMigrationIds went away) --- sled-agent/src/instance.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 8060dcea3a2..cb437d824fe 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -487,9 +487,6 @@ impl InstanceRunner { PutState { tx, .. } => { tx.send(Err(Error::Terminating.into())).map_err(|_| ()) } - PutMigrationIds { tx, .. } => { - tx.send(Err(Error::Terminating.into())).map_err(|_| ()) - } Terminate { tx, .. } => { tx.send(Err(Error::Terminating.into())).map_err(|_| ()) } From 0f5e3403f894dc7f01c9791ead1cfeb775e810a0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 29 Jul 2024 15:22:53 -0700 Subject: [PATCH 190/234] update saga should also unlink `SagaUnwound` VMMs --- nexus/src/app/sagas/instance_update/mod.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 8a26db7c52d..2f2c1257f56 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -431,7 +431,10 @@ impl UpdatesRequired { // Has the active VMM been destroyed? let destroy_active_vmm = snapshot.active_vmm.as_ref().and_then(|active_vmm| { - if active_vmm.runtime.state == VmmState::Destroyed { + if matches!( + active_vmm.runtime.state, + VmmState::Destroyed | VmmState::SagaUnwound + ) { let id = PropolisUuid::from_untyped_uuid(active_vmm.id); // Unlink the active VMM ID. 
If the active VMM was destroyed // because a migration out completed, the next block, which @@ -465,7 +468,10 @@ impl UpdatesRequired { let destroy_target_vmm = snapshot.target_vmm.as_ref().and_then(|target_vmm| { - if target_vmm.runtime.state == VmmState::Destroyed { + if matches!( + target_vmm.runtime.state, + VmmState::Destroyed | VmmState::SagaUnwound + ) { // Unlink the target VMM ID. new_runtime.dst_propolis_id = None; update_required = true; From c4c3ec8c830e19bd75851ceddab90dd41bff5680 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 29 Jul 2024 15:54:01 -0700 Subject: [PATCH 191/234] allow start sagas to clobber saga-unwound VMMs --- nexus/src/app/sagas/instance_start.rs | 69 ++++++++++++++++----------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index 4e777c931a7..b8acd0927dd 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -235,23 +235,34 @@ async fn sis_move_to_starting( // For idempotency, refetch the instance to see if this step already applied // its desired update. - let (.., db_instance) = LookupPath::new(&opctx, &datastore) + let (_, _, authz_instance, ..) = LookupPath::new(&opctx, &datastore) .instance_id(instance_id.into_untyped_uuid()) .fetch_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; + let state = datastore + .instance_fetch_with_vmm(&opctx, &authz_instance) + .await + .map_err(ActionError::action_failed)?; + + let db_instance = state.instance(); - match db_instance.runtime().propolis_id { + match state.vmm() { // If this saga's Propolis ID is already written to the record, then // this step must have completed already and is being retried, so // proceed. - Some(db_id) if db_id == propolis_id.into_untyped_uuid() => { + Some(vmm) if vmm.id == propolis_id.into_untyped_uuid() => { info!(osagactx.log(), "start saga: Propolis ID already set"; "instance_id" => %instance_id); - Ok(db_instance) + return Ok(db_instance.clone()); } + // If the instance has a Propolis ID, but the Propolis was left behind + // by a previous start saga unwinding, that's fine, we can just clear it + // out and proceed as though there was no Propolis ID here. + Some(vmm) if vmm.runtime.state == db::model::VmmState::SagaUnwound => {} + // If the instance has a different Propolis ID, a competing start saga // must have started the instance already, so unwind. Some(_) => { @@ -266,33 +277,33 @@ async fn sis_move_to_starting( // this point causes the VMM's state, which is Starting, to supersede // the instance's state, so this won't cause the instance to appear to // be running before Propolis thinks it has started.) - None => { - let new_runtime = db::model::InstanceRuntimeState { - nexus_state: db::model::InstanceState::Vmm, - propolis_id: Some(propolis_id.into_untyped_uuid()), - time_updated: Utc::now(), - gen: db_instance.runtime().gen.next().into(), - ..db_instance.runtime_state - }; - - // Bail if another actor managed to update the instance's state in - // the meantime. - if !osagactx - .datastore() - .instance_update_runtime(&instance_id, &new_runtime) - .await - .map_err(ActionError::action_failed)? 
- { - return Err(ActionError::action_failed(Error::conflict( - "instance changed state before it could be started", - ))); - } + None => {} + } - let mut new_record = db_instance.clone(); - new_record.runtime_state = new_runtime; - Ok(new_record) - } + let new_runtime = db::model::InstanceRuntimeState { + nexus_state: db::model::InstanceState::Vmm, + propolis_id: Some(propolis_id.into_untyped_uuid()), + time_updated: Utc::now(), + gen: db_instance.runtime().gen.next().into(), + ..db_instance.runtime_state + }; + + // Bail if another actor managed to update the instance's state in + // the meantime. + if !osagactx + .datastore() + .instance_update_runtime(&instance_id, &new_runtime) + .await + .map_err(ActionError::action_failed)? + { + return Err(ActionError::action_failed(Error::conflict( + "instance changed state before it could be started", + ))); } + + let mut new_record = db_instance.clone(); + new_record.runtime_state = new_runtime; + Ok(new_record) } async fn sis_move_to_starting_undo( From 8d23d66b63210a13859c8591244ebefabb654dd4 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 29 Jul 2024 16:33:17 -0700 Subject: [PATCH 192/234] properly handle SagaUnwound, part 2 --- nexus/db-model/src/vmm_state.rs | 5 ++++ nexus/db-queries/src/db/datastore/instance.rs | 30 ++++++++++++++----- nexus/db-queries/src/db/datastore/vmm.rs | 9 +++--- nexus/src/app/sagas/instance_start.rs | 13 +++++++- nexus/src/app/sagas/instance_update/mod.rs | 10 ++----- 5 files changed, 45 insertions(+), 22 deletions(-) diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index b61d79624af..7d44bbedbd1 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -41,6 +41,11 @@ impl VmmState { VmmState::SagaUnwound => "saga_unwound", } } + + /// States in which it is safe to deallocate a VMM's sled resources and mark + /// it as deleted. + pub const DESTROYABLE_STATES: &'static [Self] = + &[Self::Destroyed, Self::SagaUnwound]; } impl fmt::Display for VmmState { diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index a82bcafdce7..604084ac0a6 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -140,12 +140,15 @@ impl InstanceAndActiveVmm { // active VMM ID has been unlinked by an update saga. ( InstanceState::Vmm, - Some( - VmmState::Stopped - | VmmState::Destroyed - | VmmState::SagaUnwound, - ), + Some(VmmState::Stopped | VmmState::Destroyed), ) => external::InstanceState::Stopping, + // - An instance with a "saga unwound" VMM, on the other hand, can + // be treated as "stopped", since --- unlike "destroyed" --- a new + // start saga can run at any time by just clearing out the old VMM + // ID. + (InstanceState::Vmm, Some(VmmState::SagaUnwound)) => { + external::InstanceState::Stopped + } // - An instance with no VMM is always "stopped" (as long as it's // not "starting" etc.) (InstanceState::NoVmm, _vmm_state) => { @@ -635,6 +638,13 @@ impl DataStore { .filter(vmm_dsl::id.eq(src_propolis_id)) .filter(vmm_dsl::state.eq_any(ALLOWED_ACTIVE_VMM_STATES)) .select(vmm_dsl::instance_id); + // Subquery for checking if a present target VMM ID points at a VMM + // that's in the saga-unwound state (in which it would be okay to clear + // out that VMM). 
+ let target_vmm_unwound = vmm_dsl::vmm + .filter(vmm_dsl::id.eq(target_propolis_id)) + .filter(vmm_dsl::state.eq(VmmState::SagaUnwound)) + .select(vmm_dsl::instance_id); diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) @@ -648,9 +658,13 @@ impl DataStore { // `check_if_exists` which returns the prior state, and still // fail to update the record if another migration/target VMM ID // is already there. - (dsl::migration_id - .is_null() - .and(dsl::target_propolis_id.is_null())) + (dsl::migration_id.is_null().and( + dsl::target_propolis_id + .is_null() + // It's okay to clobber a previously-set target VMM ID + // if (and only if!) it's in the saga-unwound state. + .or(dsl::id.eq_any(target_vmm_unwound)), + )) .or(dsl::migration_id .eq(Some(migration_id)) .and(dsl::target_propolis_id.eq(Some(target_propolis_id)))), diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index eb788fdc898..4ce6d45603e 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -249,7 +249,7 @@ impl DataStore { /// /// A VMM is considered "abandoned" if (and only if): /// - /// - It is in the `Destroyed` state. + /// - It is in the `Destroyed` or `SagaUnwound` state. /// - It is not currently running an instance, and it is also not the /// migration target of any instance (i.e. it is not pointed to by /// any instance record's `active_propolis_id` and `target_propolis_id` @@ -261,16 +261,15 @@ impl DataStore { pagparams: &DataPageParams<'_, Uuid>, ) -> ListResultVec { use crate::db::schema::instance::dsl as instance_dsl; - let destroyed = DbVmmState::Destroyed; + paginated(dsl::vmm, dsl::id, pagparams) // In order to be considered "abandoned", a VMM must be: - // - in the `Destroyed` state - .filter(dsl::state.eq(destroyed)) + // - in the `Destroyed` or `SagaUnwound` state + .filter(dsl::state.eq_any(DbVmmState::DESTROYABLE_STATES)) // - not deleted yet .filter(dsl::time_deleted.is_null()) // - not pointed to by any instance's `active_propolis_id` or // `target_propolis_id`. - // .left_join( // Left join with the `instance` table on the VMM's instance ID, so // that we can check if the instance pointed to by this VMM (if diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index b8acd0927dd..9e4e010eeab 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -247,6 +247,10 @@ async fn sis_move_to_starting( let db_instance = state.instance(); + // If `true`, we have unlinked a Propolis ID left behind by a previous + // unwinding start saga, and we should activate the activate the abandoned + // VMM reaper background task once we've written back the instance record. + let mut abandoned_unwound_vmm = false; match state.vmm() { // If this saga's Propolis ID is already written to the record, then // this step must have completed already and is being retried, so @@ -261,7 +265,9 @@ async fn sis_move_to_starting( // If the instance has a Propolis ID, but the Propolis was left behind // by a previous start saga unwinding, that's fine, we can just clear it // out and proceed as though there was no Propolis ID here. - Some(vmm) if vmm.runtime.state == db::model::VmmState::SagaUnwound => {} + Some(vmm) if vmm.runtime.state == db::model::VmmState::SagaUnwound => { + abandoned_unwound_vmm = true; + } // If the instance has a different Propolis ID, a competing start saga // must have started the instance already, so unwind. 
@@ -301,6 +307,11 @@ async fn sis_move_to_starting( ))); } + // Don't fear the reaper! + if abandoned_unwound_vmm { + osagactx.nexus().background_tasks.task_abandoned_vmm_reaper.activate(); + } + let mut new_record = db_instance.clone(); new_record.runtime_state = new_runtime; Ok(new_record) diff --git a/nexus/src/app/sagas/instance_update/mod.rs index 2f2c1257f56..8a26db7c52d 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -431,7 +431,10 @@ impl UpdatesRequired { // Has the active VMM been destroyed? let destroy_active_vmm = snapshot.active_vmm.as_ref().and_then(|active_vmm| { - if matches!( - active_vmm.runtime.state, - VmmState::Destroyed | VmmState::SagaUnwound - ) { + if active_vmm.runtime.state == VmmState::Destroyed { let id = PropolisUuid::from_untyped_uuid(active_vmm.id); // Unlink the active VMM ID. If the active VMM was destroyed // because a migration out completed, the next block, which @@ -465,7 +468,10 @@ impl UpdatesRequired { let destroy_target_vmm = snapshot.target_vmm.as_ref().and_then(|target_vmm| { - if matches!( - target_vmm.runtime.state, - VmmState::Destroyed | VmmState::SagaUnwound - ) { + if target_vmm.runtime.state == VmmState::Destroyed { // Unlink the target VMM ID. new_runtime.dst_propolis_id = None; update_required = true; From f8b44f64d223bd1e20583edfeeb98c3f34c132c7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 11:49:18 -0700 Subject: [PATCH 193/234] fix target VMM unwound check when setting migration IDs This fixes an issue where `instance_set_migration_ids` didn't actually allow clobbering a `SagaUnwound` VMM, because it instead allowed it if the NEW target VMM was SagaUnwound. Also, I added a test, which is how I uncovered this in the first place. --- nexus/db-queries/src/db/datastore/instance.rs | 274 +++++++++++++++++- 1 file changed, 273 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs index 604084ac0a6..1fe6ffce90a 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -642,7 +642,7 @@ impl DataStore { // that's in the saga-unwound state (in which it would be okay to clear // out that VMM). let target_vmm_unwound = vmm_dsl::vmm - .filter(vmm_dsl::id.eq(target_propolis_id)) + .filter(vmm_dsl::id.nullable().eq(dsl::target_propolis_id)) .filter(vmm_dsl::state.eq(VmmState::SagaUnwound)) .select(vmm_dsl::instance_id); @@ -684,6 +684,17 @@ impl DataStore { ) .await .map_err(|e| { + // Turning all these errors into `NotFound` errors is a bit + // unfortunate. The query will not find anything if the + // instance ID actually doesn't exist, *or* if any of the "is + // it valid to set migration IDs in the current state?" checks + // fail, which should probably be `Error::Conflict` + // instead...but, we can't really tell which is the case here. + // + // TODO(eliza): Perhaps these should all be mapped to `Conflict` + // instead? It's arguably correct to say that trying to set + // migration IDs for an instance that doesn't exist is sort of a + // "conflict", for a sufficiently broad definition of "conflict"...
public_error_from_diesel( e, ErrorHandler::NotFoundByLookup( @@ -1820,4 +1831,265 @@ mod tests { db.cleanup().await.unwrap(); logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_instance_set_migration_ids() { + // Setup + let logctx = dev::test_setup_log("test_instance_set_migration_ids"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + + // Create the first VMM in a state where `set_migration_ids` should + // *fail* (Stopped). We will assert that we cannot set the migration + // IDs, and then advance it to Running, when we can start the migration. + let vmm1 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: authz_instance.id(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.32".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Stopped, + }, + }, + ) + .await + .expect("active VMM should be inserted successfully!"); + + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + let instance = datastore + .instance_refetch(&opctx, &authz_instance) + .await + .expect("instance should be there"); + datastore + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(instance.runtime_state.gen.0.next()), + nexus_state: InstanceState::Vmm, + propolis_id: Some(vmm1.id), + ..instance.runtime_state.clone() + }, + ) + .await + .expect("instance update should work"); + + let vmm2 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: authz_instance.id(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.42".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("second VMM should insert"); + + // make a migration... + let migration = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm1.id, vmm2.id), + ) + .await + .expect("migration should be inserted successfully!"); + + // Our first attempt to set migration IDs should fail, because the + // active VMM is Stopped. + let res = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration.id, + PropolisUuid::from_untyped_uuid(vmm2.id), + ) + .await + ); + assert!(res.is_err()); + + // Okay, now, advance the active VMM to Running, and try again. + let updated = dbg!( + datastore + .vmm_update_runtime( + &PropolisUuid::from_untyped_uuid(vmm1.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next()), + state: VmmState::Running, + }, + ) + .await + ) + .expect("updating VMM state should be fine"); + assert!(updated); + + // Now, it should work! 
+ let instance = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration.id, + PropolisUuid::from_untyped_uuid(vmm2.id), + ) + .await + ) + .expect("setting migration IDs should succeed"); + assert_eq!(instance.runtime().dst_propolis_id, Some(vmm2.id)); + assert_eq!(instance.runtime().migration_id, Some(migration.id)); + + // Doing it again should be idempotent, and the instance record + // shouldn't change. + let instance2 = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration.id, + PropolisUuid::from_untyped_uuid(vmm2.id), + ) + .await + ) + .expect("setting the same migration IDs a second time should succeed"); + assert_eq!( + instance.runtime().dst_propolis_id, + instance2.runtime().dst_propolis_id + ); + assert_eq!( + instance.runtime().migration_id, + instance2.runtime().migration_id + ); + let instance = instance2; + + // Trying to set a new migration should fail, as long as the prior stuff + // is still in place. + let vmm3 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: authz_instance.id(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.42".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("third VMM should insert"); + let migration2 = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm1.id, vmm3.id), + ) + .await + .expect("migration should be inserted successfully!"); + dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration2.id, + PropolisUuid::from_untyped_uuid(vmm3.id), + ) + .await + ).expect_err("trying to set migration IDs should fail when a previous migration and VMM are still there"); + + // Pretend the previous migration saga has unwound the VMM + let updated = dbg!( + datastore + .vmm_update_runtime( + &PropolisUuid::from_untyped_uuid(vmm2.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next().next()), + state: VmmState::SagaUnwound, + }, + ) + .await + ) + .expect("updating VMM state should be fine"); + assert!(updated); + + // It should still fail due to the presence of the migration ID. + dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration2.id, + PropolisUuid::from_untyped_uuid(vmm3.id), + ) + .await + ).expect_err("trying to set migration IDs should fail when a previous migration ID is still there"); + + // Remove the migration ID. + let updated = dbg!(datastore + .instance_update_runtime( + &instance_id, + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(instance.runtime_state.gen.0.next()), + nexus_state: InstanceState::Vmm, + propolis_id: Some(vmm1.id), + migration_id: None, + ..instance.runtime_state.clone() + }, + ) + .await + .expect("instance update should work")); + assert!(updated); + + // Now that the migration ID is gone, we should be able to clobber the + // SagaUnwound VMM ID. 
+ let instance = dbg!( + datastore + .instance_set_migration_ids( + &opctx, + instance_id, + PropolisUuid::from_untyped_uuid(vmm1.id), + migration2.id, + PropolisUuid::from_untyped_uuid(vmm3.id), + ) + .await + ) + .expect("replacing SagaUnwound VMM should work"); + assert_eq!(instance.runtime().migration_id, Some(migration2.id)); + assert_eq!(instance.runtime().dst_propolis_id, Some(vmm3.id)); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } } From ddbf2faa37f3486f1a288dad93212f57af56c12e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 13:01:12 -0700 Subject: [PATCH 194/234] placate clippy --- sled-agent/src/instance.rs | 1 + sled-agent/src/instance_manager.rs | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index cb437d824fe..631f2b83f64 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -986,6 +986,7 @@ impl Instance { /// * `services`: A set of instance manager-provided services. /// * `sled_identifiers`: Sled-related metadata used to track statistics. /// * `metadata`: Instance-related metadata used to track statistics. + #[allow(clippy::too_many_arguments)] pub(crate) fn new( log: Logger, id: InstanceUuid, diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 5ac9ddbed77..1b2fb204d0d 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -166,7 +166,7 @@ impl InstanceManager { instance_runtime, vmm_runtime, propolis_addr, - sled_identifiers, + sled_identifiers: Box::new(sled_identifiers), metadata, tx, }) @@ -349,7 +349,12 @@ enum InstanceManagerRequest { instance_runtime: InstanceRuntimeState, vmm_runtime: VmmRuntimeState, propolis_addr: SocketAddr, - sled_identifiers: SledIdentifiers, + // These are boxed because they are, apparently, quite large, and Clippy + // whinges about the overall size of this variant relative to the + // others. Since we will generally send `EnsureRegistered` requests much + // less frequently than most of the others, boxing this seems like a + // reasonable choice... 
+ sled_identifiers: Box, metadata: InstanceMetadata, tx: oneshot::Sender>, }, @@ -480,7 +485,7 @@ impl InstanceManagerRunner { instance_runtime, vmm_runtime, propolis_addr, - sled_identifiers, + *sled_identifiers, metadata ).await).map_err(|_| Error::FailedSendClientClosed) }, From f301183176252061c4e54ab8bdc33558f41539a3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 14:25:57 -0700 Subject: [PATCH 195/234] remove log file (oops) --- 83e2f8ab-86dc-491f-98d9-b66a16768ddb.log | 565 ----------------------- 1 file changed, 565 deletions(-) delete mode 100644 83e2f8ab-86dc-491f-98d9-b66a16768ddb.log diff --git a/83e2f8ab-86dc-491f-98d9-b66a16768ddb.log b/83e2f8ab-86dc-491f-98d9-b66a16768ddb.log deleted file mode 100644 index 7324b45faa5..00000000000 --- a/83e2f8ab-86dc-491f-98d9-b66a16768ddb.log +++ /dev/null @@ -1,565 +0,0 @@ -root@BRM44220001:~# looker -f $( /opt/oxide/oxlog/oxlog logs oxz_nexus_3090570f-4c2b-43ae-8124-776fbad100fa --current ) -c 'r.instance_id?.contains("83e2f8ab-86dc-491f-98d9-b66a16768ddb")' -21:25:29.253Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c - remote_addr = [fd00:1122:3344:103::1]:54972 - req_id = e81e031e-a27d-4e06-b5c0-2481f5eea1a4 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(2), time_updated: 2024-07-25T21:25:29.252818945Z } -21:25:29.257Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): new VMM runtime state from sled agent requires an instance-update saga - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_in_needs_update = false - migration_out_needs_update = false - propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c - remote_addr = [fd00:1122:3344:103::1]:54972 - req_id = e81e031e-a27d-4e06-b5c0-2481f5eea1a4 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_needs_update = true -21:25:29.273Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): starting update saga for 83e2f8ab-86dc-491f-98d9-b66a16768ddb - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1371 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - remote_addr = [fd00:1122:3344:103::1]:54972 - req_id = e81e031e-a27d-4e06-b5c0-2481f5eea1a4 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(2), time_updated: 2024-07-25T21:25:29.252818945Z } -21:25:29.351Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: attempting to lock instance - file = nexus/src/app/sagas/instance_update/start.rs:92 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a - saga_name = start-instance-update -21:25:29.365Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance 
updater lock - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - current_gen = Generation(Generation(1)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a -21:25:29.370Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - current_gen = Generation(Generation(1)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a -21:25:29.374Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance updater lock acquired! - actor_id = 001de000-05e4-4000-8000-000000000002 - already_locked = true - authenticated = true - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(2)) - saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a -21:25:29.399Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: starting real update saga... - current.active_vmm = Some(Vmm { id: 18a38b46-c2b3-45f4-8a77-3fbf43d2175c, time_created: 2024-07-25T21:25:26.371086Z, time_deleted: None, instance_id: 83e2f8ab-86dc-491f-98d9-b66a16768ddb, sled_id: 7c8b0a7f-23e1-4b88-8519-eb43ed065667, propolis_ip: V6(Ipv6Network { addr: fd00:1122:3344:103::1:34, prefix: 128 }), propolis_port: SqlU16(12400), runtime: VmmRuntimeState { time_state_updated: 2024-07-25T21:25:29.252818Z, gen: Generation(Generation(2)), state: Destroyed } }) - current.migration = None - current.runtime_state = InstanceRuntimeState { time_updated: 2024-07-25T21:25:26.884469Z, gen: Generation(Generation(2)), propolis_id: Some(18a38b46-c2b3-45f4-8a77-3fbf43d2175c), dst_propolis_id: None, migration_id: None, nexus_state: Vmm } - current.target_vmm = None - file = nexus/src/app/sagas/instance_update/start.rs:178 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 734a7030-3e17-4637-9711-19bc78dbd077 - saga_name = start-instance-update - update.deprovision = true - update.destroy_active_vmm = Some(18a38b46-c2b3-45f4-8a77-3fbf43d2175c (propolis)) - update.destroy_target_vmm = None - update.network_config_update = Some(Delete) - update.new_runtime_state = InstanceRuntimeState { time_updated: 2024-07-25T21:25:29.398999367Z, gen: Generation(Generation(3)), propolis_id: None, dst_propolis_id: None, migration_id: None, nexus_state: NoVmm } -21:25:29.420Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: trying to become instance updater... 
- instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - parent_lock = UpdaterLock { updater_id: c5a4e62e-ff9f-4edb-aa68-9a033893a62a, locked_gen: Generation(Generation(2)) } - saga_id = 41b9eb70-84a4-4397-ba86-be068cbffec7 - saga_name = instance-update -21:25:29.427Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): inherited lock from c5a4e62e-ff9f-4edb-aa68-9a033893a62a to 41b9eb70-84a4-4397-ba86-be068cbffec7 - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1051 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(3)) - parent_gen = Generation(Generation(2)) - parent_id = c5a4e62e-ff9f-4edb-aa68-9a033893a62a - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update - saga_node = BecomeUpdater - updater_id = 41b9eb70-84a4-4397-ba86-be068cbffec7 -21:25:29.427Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance_update: Now, I am become Updater, the destroyer of VMMs. - file = nexus/src/app/sagas/instance_update/mod.rs:796 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 41b9eb70-84a4-4397-ba86-be068cbffec7 - saga_name = instance-update -21:25:29.438Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: deleting network config - file = nexus/src/app/sagas/instance_update/mod.rs:839 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:29.438Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): deleting instance dpd configuration - file = nexus/src/app/instance_network.rs:548 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb -21:25:29.453Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): notifying dendrite of updates - instance_id = Some(83e2f8ab-86dc-491f-98d9-b66a16768ddb (instance)) - switch = switch1 -21:25:29.607Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): notifying dendrite of updates - instance_id = Some(83e2f8ab-86dc-491f-98d9-b66a16768ddb (instance)) - switch = switch0 -21:25:29.784Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (no VMM): deallocated virtual provisioning resources - file = nexus/src/app/sagas/instance_update/mod.rs:918 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - records_deleted = [VirtualProvisioningCollection { id: 001de000-1334-4000-8000-000000000000, time_modified: Some(2024-07-25T21:25:29.775985Z), collection_type: "Fleet", virtual_disk_bytes_provisioned: ByteCount(ByteCount(11811160064)), cpus_provisioned: 10, ram_provisioned: ByteCount(ByteCount(10737418240)) }, VirtualProvisioningCollection { id: 64900d6e-0c92-4d8c-a035-25c1c179125f, time_modified: Some(2024-07-25T21:25:29.775985Z), collection_type: "Silo", virtual_disk_bytes_provisioned: ByteCount(ByteCount(11811160064)), cpus_provisioned: 10, ram_provisioned: ByteCount(ByteCount(10737418240)) }, VirtualProvisioningCollection { id: 73a1ed16-acc9-4913-b82e-108105e6bed2, time_modified: Some(2024-07-25T21:25:29.775985Z), collection_type: "Project", virtual_disk_bytes_provisioned: ByteCount(ByteCount(11811160064)), cpus_provisioned: 10, ram_provisioned: ByteCount(ByteCount(10737418240)) }] - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:29.791Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (no VMM): unassigning oximeter producer - file = nexus/src/app/sagas/instance_update/mod.rs:953 - 
instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:29.798Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: committing new runtime state and unlocking... - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 41b9eb70-84a4-4397-ba86-be068cbffec7, locked_gen: Generation(Generation(3)) } - new_runtime = InstanceRuntimeState { time_updated: 2024-07-25T21:25:29.398999367Z, gen: Generation(Generation(3)), propolis_id: None, dst_propolis_id: None, migration_id: None, nexus_state: NoVmm } - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:29.834Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: committed update new runtime state! - did_unlock = true - file = nexus/src/app/sagas/instance_update/mod.rs:1001 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - new_runtime = InstanceRuntimeState { time_updated: 2024-07-25T21:25:29.398999367Z, gen: Generation(Generation(3)), propolis_id: None, dst_propolis_id: None, migration_id: None, nexus_state: NoVmm } - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:29.863Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (VMM destroyed): deallocating sled resource reservation - file = nexus/src/app/sagas/instance_update/destroyed.rs:83 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:29.872Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update (VMM destroyed): marking VMM record deleted - file = nexus/src/app/sagas/instance_update/destroyed.rs:114 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - propolis_id = 18a38b46-c2b3-45f4-8a77-3fbf43d2175c - saga_id = 07d0a44e-e348-4bd9-ada0-1e9e4eebf1ad - saga_name = instance-update -21:25:42.274Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = b29d225b-7eb5-40c9-9055-e8011a1bce4e - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Starting, gen: Generation(2), time_updated: 2024-07-25T21:25:42.273996740Z } -21:25:42.279Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = bbd804e9-76e0-40d9-838b-d710c45cc978 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:25:55.258Z DEBG 
3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state - background_task = instance_watcher - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - state = Running -21:25:55.258Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent - background_task = instance_watcher - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:26:25.342Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state - background_task = instance_watcher - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - state = Running -21:26:25.342Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent - background_task = instance_watcher - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:26:55.245Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state - background_task = instance_watcher - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - state = Running -21:26:55.245Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent - background_task = instance_watcher - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:27:25.806Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state - background_task = instance_watcher - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - state = Running -21:27:25.806Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent - background_task = instance_watcher - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:27:55.254Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): updating instance state - background_task = instance_watcher - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - state = Running -21:27:55.254Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent - background_task = instance_watcher - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:28:25.653Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): 
updating instance state - background_task = instance_watcher - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - state = Running -21:28:25.653Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): received new VMM runtime state from sled agent - background_task = instance_watcher - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - vmm_state = VmmRuntimeState { state: Running, gen: Generation(3), time_updated: 2024-07-25T21:25:42.279615082Z } -21:28:27.578Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = c2913dd1-4210-43be-986f-f0f4ac400678 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Stopping, gen: Generation(5), time_updated: 2024-07-25T21:28:27.577646756Z } -21:28:30.621Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new VMM runtime state from sled agent - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = cd2f9516-79a4-4dc4-9dc3-f1d74a9df395 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(7), time_updated: 2024-07-25T21:28:30.620811030Z } -21:28:30.635Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): new VMM runtime state from sled agent requires an instance-update saga - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_in_needs_update = false - migration_out_needs_update = false - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = cd2f9516-79a4-4dc4-9dc3-f1d74a9df395 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_needs_update = true -21:28:30.651Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): starting update saga for 83e2f8ab-86dc-491f-98d9-b66a16768ddb - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1371 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = cd2f9516-79a4-4dc4-9dc3-f1d74a9df395 - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(7), time_updated: 2024-07-25T21:28:30.620811030Z } -21:28:30.652Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (dropshot_internal): received new 
VMM runtime state from sled agent - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - file = nexus/src/app/instance.rs:1869 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - local_addr = [fd00:1122:3344:102::4]:12221 - method = PUT - migration_state = Migrations { migration_in: None, migration_out: None } - propolis_id = a549321e-e2c4-40b7-a2ba-0ae492878f0c - remote_addr = [fd00:1122:3344:101::1]:43502 - req_id = 5e7ceff7-3899-4054-a9b5-e580da62b6cc - uri = /instances/83e2f8ab-86dc-491f-98d9-b66a16768ddb - vmm_state = VmmRuntimeState { state: Destroyed, gen: Generation(7), time_updated: 2024-07-25T21:28:30.620811030Z } -21:28:30.701Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: attempting to lock instance - file = nexus/src/app/sagas/instance_update/start.rs:92 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - saga_name = start-instance-update -21:28:30.715Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - current_gen = Generation(Generation(4)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:30.721Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempting to acquire instance updater lock - actor_id = 001de000-05e4-4000-8000-000000000002 - authenticated = true - current_gen = Generation(Generation(4)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:30.724Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance updater lock acquired! - actor_id = 001de000-05e4-4000-8000-000000000002 - already_locked = true - authenticated = true - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:30.765Z DEBG 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: unlocking instance on unwind - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update -21:28:30.768Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:30.768Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 0 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
- file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 3.72443ms -21:28:31.082Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:31.082Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 1 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! - file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 316.89294ms -21:28:31.634Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:31.635Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 2 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! - file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 869.80914ms -21:28:32.994Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:32.994Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 3 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
- file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 2.228850544s -21:28:35.581Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:35.581Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 4 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! - file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 4.816258375s -21:28:41.493Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:41.493Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 5 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! - file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 10.728196913s -21:28:47.821Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:28:47.821Z INFO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): server error while recording saga event, retrying - call_count = 6 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
- file = nexus/src/app/sagas/instance_update/mod.rs:1202 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 17.055072611s -21:29:01.460Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:29:01.460Z WARN 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: server error while unlocking instance, - retrying - call_count = 7 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! - file = nexus/src/app/sagas/instance_update/mod.rs:1191 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 30.693835165s -21:29:17.617Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:29:17.617Z WARN 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: server error while unlocking instance, - retrying - call_count = 8 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! 
- file = nexus/src/app/sagas/instance_update/mod.rs:1191 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 46.850057263s -21:30:35.990Z ERRO 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): attempted to release a lock held by another saga - actor_id = 001de000-05e4-4000-8000-000000000002 - actual_id = 716a5faa-9a9d-456d-a242-32cc9a846fab - authenticated = true - file = nexus/db-queries/src/db/datastore/instance.rs:1160 - found_gen = Generation(Generation(5)) - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - locked_gen = Generation(Generation(5)) - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - saga_node = LockInstance - updater_id = 716a5faa-9a9d-456d-a242-32cc9a846fab -21:30:35.990Z WARN 3090570f-4c2b-43ae-8124-776fbad100fa (ServerContext): instance update: server error while unlocking instance, - retrying - call_count = 9 - error = Internal Error: attempted to release a lock held by another saga! this is a bug! - file = nexus/src/app/sagas/instance_update/mod.rs:1191 - instance_id = 83e2f8ab-86dc-491f-98d9-b66a16768ddb - lock = UpdaterLock { updater_id: 716a5faa-9a9d-456d-a242-32cc9a846fab, locked_gen: Generation(Generation(5)) } - saga_id = 9fde65bf-fb0a-4a82-9b2b-276789f57a76 - saga_name = start-instance-update - total_duration = 125.218673206s From d68f0dea1add4ede1aabb9c23f7fc8f9779f93db Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 15:06:13 -0700 Subject: [PATCH 196/234] start addressing @smklein's review suggestions this is basically just the easy bits, i'm going to make more of the changes you've suggested as well! these are just the trivial ones i could easily fix while reading through the review comments. --- nexus/db-queries/src/db/datastore/instance.rs | 44 +++++++++++-------- sled-agent/src/common/instance.rs | 5 +-- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 1fe6ffce90a..7c286823a67 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -368,6 +368,9 @@ impl DataStore { vmm_dsl::vmm .filter(vmm_dsl::state.eq(VmmState::Destroyed)) + // If the VMM record has already been deleted, we don't need to do + // anything about it --- someone already has. + .filter(vmm_dsl::time_deleted.is_null()) .inner_join( dsl::instance.on(dsl::active_propolis_id .eq(vmm_dsl::id.nullable()) @@ -612,6 +615,11 @@ impl DataStore { /// `target_propolis_id`, if the instance does not currently have an active /// migration, and the active VMM is in the [`VmmState::Running`] or /// [`VmmState::Rebooting`] states. + /// + /// Note that a non-NULL `target_propolis_id` will be overwritten, if (and + /// only if) the target VMM record is in [`VmmState::SagaUnwound`], + /// indicating that it was left behind by a failed `instance-migrate` saga + /// unwinding. pub async fn instance_set_migration_ids( &self, opctx: &OpContext, @@ -636,6 +644,7 @@ impl DataStore { // that we can use it in a `filter` on the update query. 
let vmm_ok = vmm_dsl::vmm .filter(vmm_dsl::id.eq(src_propolis_id)) + .filter(vmm_dsl::time_deleted.is_null()) .filter(vmm_dsl::state.eq_any(ALLOWED_ACTIVE_VMM_STATES)) .select(vmm_dsl::instance_id); // Subquery for checking if a present target VMM ID points at a VMM @@ -643,6 +652,10 @@ impl DataStore { // out that VMM). let target_vmm_unwound = vmm_dsl::vmm .filter(vmm_dsl::id.nullable().eq(dsl::target_propolis_id)) + // Don't filter out target VMMs with `time_deleted` set here --- we + // *shouldn't* have deleted the VMM without unlinking it from the + // instance record, but if something did, we should still allow the + // ID to be clobbered. .filter(vmm_dsl::state.eq(VmmState::SagaUnwound)) .select(vmm_dsl::instance_id); @@ -683,25 +696,12 @@ impl DataStore { &*self.pool_connection_authorized(opctx).await?, ) .await - .map_err(|e| { - // Turning all these errors into `NotFound` errors is a bit - // unfortunate. The query will not find anything fail if the - // instance ID actually doesn't exist, *or* if any of the "is - // it valid to set migration IDs in the current state?" checks - // fail, which should probably be `Error::Conflict` - // instead...but, we can't really tell which is the case here. - // - // TODO(eliza): Perhaps these should all be mapped to `Conflict` - // instead? It's arguably correct to say that trying to set - // migration IDs for an instance that doesn't exist is sort of a - // "conflict", for a significantly broad definition of "conflcit"... - public_error_from_diesel( - e, - ErrorHandler::NotFoundByLookup( - ResourceType::Instance, - LookupType::ById(instance_id.into_untyped_uuid()), - ), - ) + .map_err(|error| { + Error::conflict(format!( + "cannot set migration ID {migration_id} for instance \ + {instance_id} (perhaps a previous migration is already \ + set): {error:#}" + )) }) } @@ -710,6 +710,12 @@ impl DataStore { /// /// This method will only unset the instance's migration IDs if they match /// the provided ones. + /// # Returns + /// + /// - `Ok(true)` if the migration IDs were unset, + /// - `Ok(false)` if the instance IDs have *already* been unset (this method + /// is idempotent) + /// - `Err` if the database query returned an error. pub async fn instance_unset_migration_ids( &self, opctx: &OpContext, diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index dc8d1b09a6e..4f137439880 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -263,10 +263,7 @@ impl InstanceStates { migration: &mut MigrationRuntimeState, now: DateTime, ) { - if matches!( - migration.state, - MigrationState::InProgress | MigrationState::Pending - ) { + if !migration.state.is_terminal() { migration.gen = migration.gen.next(); migration.time_updated = now; migration.state = MigrationState::Failed; From 3cdac70ddfeb642d1745ee8e9aeb29a8c7502316 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 15:19:58 -0700 Subject: [PATCH 197/234] fix comments Co-authored-by: Sean Klein --- common/src/api/internal/nexus.rs | 2 +- nexus/examples/config-second.toml | 2 +- nexus/examples/config.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 75eb0b37eab..7f4eb358a4a 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -126,7 +126,7 @@ pub struct SledInstanceState { /// The current state of any inbound migration to this VMM. 
pub migration_in: Option, - /// The state of any outbound migration to this VMM. + /// The state of any outbound migration from this VMM. pub migration_out: Option, } diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 40fe8da632e..754f37c064f 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -132,7 +132,7 @@ region_replacement.period_secs = 30 region_replacement_driver.period_secs = 10 # How frequently to query the status of active instances. instance_watcher.period_secs = 30 -# How frequently to schedule new instance update sagass. +# How frequently to schedule new instance update sagas. instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 7555c86c2a7..bd50e846bdc 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -118,7 +118,7 @@ region_replacement.period_secs = 30 region_replacement_driver.period_secs = 10 # How frequently to query the status of active instances. instance_watcher.period_secs = 30 -# How frequently to schedule new instance update sagass. +# How frequently to schedule new instance update sagas. instance_updater.period_secs = 30 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 From bfd2c32c87713b6433458bd77c4052b53a9737ff Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 15:31:19 -0700 Subject: [PATCH 198/234] `InstanceGestalt` RIDES AGAIN! @smklein points out that there is already a technical meaning of "snapshot" here when referring to "disk snapshots", and using the same word is potentially confusing...I'm deciding that it's more confusing than forcing a future reader to look up "gestalt" in the dictionary, and cackling gleefully as I sneak another five-dollar word into the codebase. --- nexus/db-queries/src/db/datastore/instance.rs | 10 ++--- nexus/db-queries/src/db/datastore/mod.rs | 2 +- nexus/src/app/sagas/instance_update/mod.rs | 42 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 7c286823a67..8b7d7d2278c 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -199,13 +199,13 @@ impl From for external::Instance { } } -/// A complete snapshot of the database records describing the current state of +/// The totality of database records describing the current state of /// an instance: the [`Instance`] record itself, along with its active [`Vmm`], /// target [`Vmm`], and current [`Migration`], if they exist. /// /// This is returned by [`DataStore::instance_fetch_all`]. #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] -pub struct InstanceSnapshot { +pub struct InstanceGestalt { /// The instance record. pub instance: Instance, /// The [`Vmm`] record pointed to by the instance's `active_propolis_id`, if @@ -485,7 +485,7 @@ impl DataStore { /// instance in a single atomic query. 
/// /// If an instance with the provided UUID exists, this method returns an - /// [`InstanceSnapshot`], which contains the following: + /// [`InstanceGestalt`], which contains the following: /// /// - The [`Instance`] record itself, /// - The instance's active [`Vmm`] record, if the `active_propolis_id` @@ -498,7 +498,7 @@ impl DataStore { &self, opctx: &OpContext, authz_instance: &authz::Instance, - ) -> LookupResult { + ) -> LookupResult { opctx.authorize(authz::Action::Read, authz_instance).await?; use db::schema::instance::dsl as instance_dsl; @@ -564,7 +564,7 @@ impl DataStore { ) })?; - Ok(InstanceSnapshot { instance, migration, active_vmm, target_vmm }) + Ok(InstanceGestalt { instance, migration, active_vmm, target_vmm }) } // TODO-design It's tempting to return the updated state of the Instance diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index d9ea3ad31ba..58259be7ee2 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -111,7 +111,7 @@ mod zpool; pub use address_lot::AddressLotCreateResult; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; -pub use instance::{InstanceAndActiveVmm, InstanceSnapshot}; +pub use instance::{InstanceAndActiveVmm, InstanceGestalt}; pub use inventory::DataStoreInventoryTest; use nexus_db_model::AllSchemaVersions; pub use rack::RackInit; diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 8a26db7c52d..51af0ef8b5b 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -144,29 +144,29 @@ //! When an `instance-update` saga is started, it attempts to [acquire the //! updater lock][instance_updater_lock]. If the lock is already held by another //! update saga, then the update saga completes immediately. Otherwise, the saga -//! then queries CRDB for a snapshot of the current state of the `instance`` -//! record, the active and migration-target `vmm` records (if any exist), and -//! the current `migration` record (if one exists). This snapshot represents the -//! state from which the update will be applied, and must be read only after -//! locking the instance to ensure that it cannot race with another saga. +//! then queries CRDB for the current state of the `instance` record, the active +//! and migration-target `vmm` records (if any exist), and the current +//! `migration` record (if one exists). This snapshot represents the state from +//! which the update will be applied, and must be read only after locking the +//! instance to ensure that it cannot race with another saga. //! //! This is where another of this saga's weird quirks shows up: the shape of the //! saga DAG we wish to execute depends on this instance, active VMM, target -//! VMM, and migration snapshot. But, because this snapshot may only be taken -//! once the lock is acquired, and --- as we discussed above --- the -//! instance-updater lock may only ever be acquired within a saga, we arrive at -//! a bit of a weird impasse: we can't determine what saga DAG to build without -//! looking at the snapshot, but we can't take the snapshot until we've already -//! started a saga. To solve this, we've split this saga into two pieces: the -//! first, `start-instance-update`, is a very small saga that just tries to lock -//! the instance, and upon doing so, loads the instance snapshot from the -//! database and prepares and executes the "real" instance update saga. 
Once the -//! "real" saga starts, it "inherits" the lock from the start saga by performing -//! [the SQL equivalent equivalent of a compare-and-swap +//! VMM, and migration. But, because the precondition for the saga state may +//! only be read once the lock is acquired, and --- as we discussed above --- +//! the instance-updater lock may only ever be acquired within a saga, we arrive +//! at a bit of a weird impasse: we can't determine what saga DAG to build +//! without looking at the initial state, but we can't load the state until +//! we've already started a saga. To solve this, we've split this saga into two +//! pieces: the first, `start-instance-update`, is a very small saga that just +//! tries to lock the instance, and upon doing so, loads the instance state from +//! the database and prepares and executes the "real" instance update saga. Once +//! the "real" saga starts, it "inherits" the lock from the start saga by +//! performing [the SQL equivalent equivalent of a compare-and-swap //! operation][instance_updater_inherit_lock] with its own UUID. //! -//! The DAG for the "real" update saga depends on the snapshot read within the -//! lock, and since the lock was never released, that snapshot remains valid for +//! The DAG for the "real" update saga depends on the state read within the +//! lock, and since the lock was never released, that state remains valid for //! its execution. As the final action of the update saga, the instance record's //! new runtime state is written back to the database and the lock is released, //! in a [single atomic operation][instance_updater_unlock]. Should the update @@ -258,7 +258,7 @@ use super::{ ACTION_GENERATE_ID, }; use crate::app::db::datastore::instance; -use crate::app::db::datastore::InstanceSnapshot; +use crate::app::db::datastore::InstanceGestalt; use crate::app::db::datastore::VmmStateUpdateResult; use crate::app::db::lookup::LookupPath; use crate::app::db::model::ByteCount; @@ -417,7 +417,7 @@ struct Deprovision { impl UpdatesRequired { fn for_snapshot( log: &slog::Logger, - snapshot: &InstanceSnapshot, + snapshot: &InstanceGestalt, ) -> Option { let mut new_runtime = snapshot.instance.runtime().clone(); new_runtime.gen = Generation(new_runtime.gen.next()); @@ -2059,7 +2059,7 @@ mod test { struct MigrationTest { outcome: MigrationOutcome, instance_id: InstanceUuid, - initial_state: InstanceSnapshot, + initial_state: InstanceGestalt, authz_instance: authz::Instance, opctx: OpContext, } From 12f69b404a20c97d07826ae04d372d6e70f645af Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 30 Jul 2024 17:00:54 -0700 Subject: [PATCH 199/234] update openapi (changed a comment) --- openapi/sled-agent.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index bf51462c9d9..ecaff330428 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -4518,7 +4518,7 @@ }, "migration_out": { "nullable": true, - "description": "The state of any outbound migration to this VMM.", + "description": "The state of any outbound migration from this VMM.", "allOf": [ { "$ref": "#/components/schemas/MigrationRuntimeState" From 558de3c54c0cbe0887c0924f61554656fdf5496f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 09:47:46 -0700 Subject: [PATCH 200/234] don't duplicate `SimulatedMigration` API types as suggested by @smklein; the simulated sled-agent can just reuse these types from `sled-agent-client`, rather than duplicating them. 
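
For reference, a request body for that test API can be built directly from the shared types; the following is a minimal sketch using only the names introduced in the diff below (the surrounding test plumbing is an assumption, not part of this patch):

    // Sketch: construct the `/instances/{id}/sim-migration-source` request
    // body from the shared `sled-agent-client` types.
    fn example_migration_source_body()
        -> sled_agent_client::SimulateMigrationSource
    {
        sled_agent_client::SimulateMigrationSource {
            // Hypothetical migration ID; a real test would pass the ID of the
            // migration record it actually created.
            migration_id: uuid::Uuid::new_v4(),
            result: sled_agent_client::SimulatedMigrationResult::Success,
        }
    }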
--- clients/sled-agent-client/src/lib.rs | 19 ++++++++++++++----- sled-agent/src/sim/instance.rs | 17 +++-------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index ba3f0a054d5..4ed5aaa1cb5 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -509,19 +509,28 @@ impl TestInterfaces for Client { } } -// N.B. that this needs to be kept in sync with the types defined in -// `sled_agent::sim`! AFAICT this is the first simulated-only interface that has -// a body, so I wasn't sure whether there was a nice way to do this without -// creating a cyclic dependency or taking a giant pile of query params instead -// of JSON... +/// Parameters to the `/instances/{id}/sim-migration-source` test API. +/// +/// This message type is not included in the OpenAPI spec, because this API +/// exists only in test builds. #[derive(Serialize, Deserialize, JsonSchema)] pub struct SimulateMigrationSource { + /// The ID of the migration out of the instance's current active VMM. pub migration_id: Uuid, + /// What migration result (success or failure) to simulate. pub result: SimulatedMigrationResult, } +/// The result of a simulated migration out from an instance's current active +/// VMM. #[derive(Serialize, Deserialize, JsonSchema)] pub enum SimulatedMigrationResult { + /// Simulate a successful migration out. Success, + /// Simulate a failed migration out. + /// + /// # Note + /// + /// This is not currently implemented by the simulated sled-agent. Failure, } diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index 38f987f67a1..8ee0130262e 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -21,9 +21,6 @@ use propolis_client::types::{ InstanceMigrationStatus as PropolisMigrationStatus, InstanceState as PropolisInstanceState, InstanceStateMonitorResponse, }; -use schemars::JsonSchema; -use serde::Deserialize; -use serde::Serialize; use std::collections::VecDeque; use std::sync::Arc; use std::sync::Mutex; @@ -31,17 +28,9 @@ use uuid::Uuid; use crate::common::instance::{Action as InstanceAction, InstanceStates}; -#[derive(Serialize, Deserialize, JsonSchema)] -pub struct SimulateMigrationSource { - pub(in crate::sim) migration_id: Uuid, - pub(in crate::sim) result: SimulatedMigrationResult, -} - -#[derive(Serialize, Deserialize, JsonSchema)] -pub(in crate::sim) enum SimulatedMigrationResult { - Success, - Failure, -} +pub use sled_agent_client::{ + SimulateMigrationSource, SimulatedMigrationResult, +}; #[derive(Clone, Debug)] enum MonitorChange { From b7075b6652dc599eeee8dd7b6eb9a3bc567ca137 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 10:10:39 -0700 Subject: [PATCH 201/234] better document unwinding behavior --- nexus/src/app/sagas/instance_update/mod.rs | 59 +++++++++++++++++++--- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 51af0ef8b5b..224f5aae9f4 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -89,9 +89,9 @@ //! reliably, we require that all mutations of an instance` record are performed //! by a saga. The following sagas currently touch the `instance` record: //! -//! - [`instance_start`](super::instance_start) -//! - [`instance_migrate`](super::instance_migrate) -//! - [`instance_delete`](super::instance_delete) +//! 
- [`instance_start`] +//! - [`instance_migrate`] +//! - [`instance_delete`] //! - `instance_update` (this saga) //! //! For most of these sagas, the instance state machine itself guards against @@ -236,6 +236,48 @@ //! fails to acquire the lock and exits, it activates the background task as //! well. This ensures that we will attempt the update again. //! +//! ### On Unwinding +//! +//! Typically, when a Nexus saga unwinds, each node's reverse action undoes any +//! changes made by the forward action. The `instance-update` saga, however, is +//! a bit different: most of its nodes don't have reverse actions that undo the +//! action they performed. This is because, unlike `instance-start`, +//! `instance-migrate``, or `instance-delete`, the instance-update saga is +//! **not** attempting to perform a state change for the instance that was +//! requested by an operator. Instead, it is attempting to update the +//! database and networking configuration *to match a state change that has +//! already occurred.* +//! +//! Consider the folliwng: if we run an `instance-start` saga, and the instance +//! cannot actually be started, of course we would want the unwinding saga to +//! undo any database changes it has made, because the instance was not actually +//! started. Failing to undo those changes when an `instance-start` saga unwinds +//! would mean the database is left in a state that does not reflect reality, as +//! the instance was not actually started. On the other hand, suppose an +//! instance's active VMM shuts down and we start an `instance-update` saga to +//! move it to the `Destroyed` state. Even if some action along the way fails, the +//! instance is still `Destroyed``; that state transition has *already happened* +//! on the sled, and unwinding the update saga cannot and should not un-destroy +//! the VMM. +//! +//! So, unlike other sagas, we want to leave basically anything we've +//! successfully done in place when unwinding, because even if the update is +//! incomplete, we have still brought Nexus' understanding of the instance +//! *closer* to reality. If there was something we weren't able to do, one of +//! the instance-update-related RPWs[^rpws] will start a new update saga to try +//! it again. Because saga actions are idempotent, attempting to do something +//! that was already successfully performed a second time isn't a problem, and +//! we don't need to undo it. +//! +//! The one exception to this is, as [discussed +//! above](#the-instance-updater-lock-or-distributed-raii), unwinding instance +//! update sagas MUST always release the instance-updater lock, so that a +//! subsequent saga can update the instance. Thus, the saga actions which lock +//! the instance have reverse actions that release the updater lock. +//! +//! [`instance_start`]: super::instance_start +//! [`instance_migrate`]: super::instance_migrate +//! [`instance_delete`]: super::instance_delete //! [instance_updater_lock]: //! crate::app::db::datastore::DataStore::instance_updater_lock //! [instance_updater_inherit_lock]: @@ -252,6 +294,8 @@ //! [^3]: Even if the Nexus instance that processed the state update died //! between when it wrote the state to CRDB and when it started the //! requisite update saga! +//! [^rpws]: Either the `instance-updater` or `abandoned-vmm-reaper` background +//! tasks, as appropriate. 
use super::{ ActionRegistry, NexusActionContext, NexusSaga, SagaInitError, @@ -1382,16 +1426,17 @@ mod test { // Unlike most other sagas, we actually don't unwind the work performed // by an update saga, as we would prefer that at least some of it - // succeeds. The only thing that *needs* to be rolled back when an + // succeeds. The only thing that *needs* to be rolled back when an // instance-update saga fails is that the updater lock *MUST* be - // released so that a subsequent saga can run. - // + // released so that a subsequent saga can run. See the section "on + // unwinding" in the documentation comment at the top of the + // instance-update module for details. + assert_instance_unlocked(instance); // Additionally, we assert that the instance record is in a // consistent state, ensuring that all changes to the instance record // are atomic. This is important *because* we won't roll back changes // to the instance: if we're going to leave them in place, they can't // be partially applied, even if we unwound partway through the saga. - assert_instance_unlocked(instance); assert_instance_record_is_consistent(instance); // Throw away the instance so that subsequent unwinding From 48be892a79092859745fdf42fc0e1b9216aab1b1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 10:12:29 -0700 Subject: [PATCH 202/234] update openapi yet again --- openapi/nexus-internal.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 6400b71d0f8..7e4d6e6c027 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4658,7 +4658,7 @@ }, "migration_out": { "nullable": true, - "description": "The state of any outbound migration to this VMM.", + "description": "The state of any outbound migration from this VMM.", "allOf": [ { "$ref": "#/components/schemas/MigrationRuntimeState" From e2a1ee58768d1cd951f1722f31b6950da4801dfd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 11:30:52 -0700 Subject: [PATCH 203/234] fix typo Co-authored-by: Sean Klein --- nexus/src/app/sagas/instance_update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 224f5aae9f4..6134bc7f7a2 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -248,7 +248,7 @@ //! database and networking configuration *to match a state change that has //! already occurred.* //! -//! Consider the folliwng: if we run an `instance-start` saga, and the instance +//! Consider the following: if we run an `instance-start` saga, and the instance //! cannot actually be started, of course we would want the unwinding saga to //! undo any database changes it has made, because the instance was not actually //! started. 
Failing to undo those changes when an `instance-start` saga unwinds From a0e6042e86eeda9f3ceb75d37678a14176d29560 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 12:34:19 -0700 Subject: [PATCH 204/234] turns out we can just totally disable it in tests --- nexus-config/src/nexus_config.rs | 11 +++ nexus/src/app/background/init.rs | 10 +- .../app/background/tasks/instance_updater.rs | 91 +++++++++++-------- nexus/tests/config.test.toml | 7 +- 4 files changed, 74 insertions(+), 45 deletions(-) diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 49c78dae53b..9d8bf1ac9ba 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -568,6 +568,15 @@ pub struct InstanceUpdaterConfig { /// period (in seconds) for periodic activations of this background task #[serde_as(as = "DurationSeconds")] pub period_secs: Duration, + + /// disable background checks for instances in need of updates. + /// + /// This config is intended for use in testing, and should generally not be + /// enabled in real life. + /// + /// Default: Off + #[serde(default)] + pub disable: bool, } #[serde_as] @@ -859,6 +868,7 @@ mod test { region_replacement_driver.period_secs = 30 instance_watcher.period_secs = 30 instance_updater.period_secs = 30 + instance_updater.disable = false service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 @@ -1008,6 +1018,7 @@ mod test { }, instance_updater: InstanceUpdaterConfig { period_secs: Duration::from_secs(30), + disable: false, }, service_firewall_propagation: ServiceFirewallPropagationConfig { diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 385d95c317c..977067f8fae 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -638,11 +638,19 @@ impl BackgroundTasksInitializer { let updater = instance_updater::InstanceUpdater::new( datastore.clone(), sagas.clone(), + config.instance_updater.disable, ); + let period = if config.instance_updater.disable { + // If we're explicitly disabled by the config, don't waste + // energy activating the background task just to have it do nothing. 
+ std::time::Duration::MAX + } else { + config.instance_updater.period_secs + }; driver.register( TaskDefinition { name: "instance_updater", description: "detects if instances require update sagas and schedules them", - period: config.instance_updater.period_secs, + period, task_impl: Box::new(updater), opctx: opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 183a12fe24d..46a3bead215 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -26,14 +26,19 @@ use tokio::task::JoinSet; pub struct InstanceUpdater { datastore: Arc, sagas: Arc, + disable: bool, } impl InstanceUpdater { - pub fn new(datastore: Arc, sagas: Arc) -> Self { - InstanceUpdater { datastore, sagas } + pub fn new( + datastore: Arc, + sagas: Arc, + disable: bool, + ) -> Self { + InstanceUpdater { datastore, sagas, disable } } - async fn activate2( + async fn actually_activate( &mut self, opctx: &OpContext, stats: &mut ActivationStats, @@ -205,43 +210,49 @@ impl BackgroundTask for InstanceUpdater { ) -> BoxFuture<'a, serde_json::Value> { async { let mut stats = ActivationStats::default(); - let error = match self.activate2(opctx, &mut stats).await { - Ok(()) => { - slog::info!( - &opctx.log, - "instance updater activation completed"; - "destroyed_active_vmms" => stats.destroyed_active_vmms, - "terminated_active_migrations" => stats.terminated_active_migrations, - "update_sagas_started" => stats.sagas_started, - "update_sagas_completed" => stats.sagas_completed, - ); - debug_assert_eq!( - stats.sagas_failed, - 0, - "if the task completed successfully, then no sagas \ - should have failed", - ); - debug_assert_eq!( - stats.saga_start_failures, - 0, - "if the task completed successfully, all sagas \ - should have started successfully" - ); - None - } - Err(error) => { - slog::warn!( - &opctx.log, - "instance updater activation failed!"; - "error" => %error, - "destroyed_active_vmms" => stats.destroyed_active_vmms, - "terminated_active_migrations" => stats.terminated_active_migrations, - "update_sagas_started" => stats.sagas_started, - "update_sagas_completed" => stats.sagas_completed, - "update_sagas_failed" => stats.sagas_failed, - "update_saga_start_failures" => stats.saga_start_failures, - ); - Some(error.to_string()) + + let error = if self.disable { + slog::info!(&opctx.log, "background instance updater explicitly disabled"); + None + } else { + match self.actually_activate(opctx, &mut stats).await { + Ok(()) => { + slog::info!( + &opctx.log, + "instance updater activation completed"; + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + ); + debug_assert_eq!( + stats.sagas_failed, + 0, + "if the task completed successfully, then no sagas \ + should have failed", + ); + debug_assert_eq!( + stats.saga_start_failures, + 0, + "if the task completed successfully, all sagas \ + should have started successfully" + ); + None + } + Err(error) => { + slog::warn!( + &opctx.log, + "instance updater activation failed!"; + "error" => %error, + "destroyed_active_vmms" => stats.destroyed_active_vmms, + "terminated_active_migrations" => stats.terminated_active_migrations, + "update_sagas_started" => stats.sagas_started, + "update_sagas_completed" => stats.sagas_completed, + 
"update_sagas_failed" => stats.sagas_failed, + "update_saga_start_failures" => stats.saga_start_failures, + ); + Some(error.to_string()) + } } }; json!({ diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index f78aee3a88d..8f65a73204a 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -134,10 +134,9 @@ lookup_region_port.period_secs = 60 # to be executed in a timely manner, so for integration tests, we don't want to # *rely* on the instance-updater background task for running these sagas. # -# Therefore, set a period long enough that this task won't activate during a -# reasonable integration test execution. Tests for the instance-updater task -# will explictly activate it. -instance_updater.period_secs = 600 +# Therefore, disable the background task during tests. +instance_updater.disable = true +instance_updater.period_secs = 60 [default_region_allocation_strategy] # we only have one sled in the test environment, so we need to use the From 0b89c58094d5c99e521e79e18a05d19e6be42248 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 12:43:04 -0700 Subject: [PATCH 205/234] turns out it's fine to not unlock deleted instances --- nexus/db-queries/src/db/datastore/instance.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 8b7d7d2278c..89f78ef2688 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1128,9 +1128,7 @@ impl DataStore { let UpdaterLock { updater_id, locked_gen } = *lock; let result = diesel::update(dsl::instance) - // N.B. that we intentionally *don't* filter out instances that have - // been deleted. If the instance doesn't exist, whatever. It is, by - // definition, "unlocked"... :) + .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(instance_id)) // Only unlock the instance if: // - the provided updater ID matches that of the saga that has From 9788b4021b0ffa8f14c43f95823f9540f961f609 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 13:09:17 -0700 Subject: [PATCH 206/234] fix unfinished comments --- nexus/db-queries/src/db/datastore/instance.rs | 5 +++++ nexus/src/app/sagas/instance_update/mod.rs | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 89f78ef2688..31380feb96f 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -124,6 +124,11 @@ impl InstanceAndActiveVmm { // Instead, we'll continue to report the instance's state as // "migrating" until an instance-update saga has resolved the // outcome of the migration, since only the instance-update saga + // can complete the migration and update the instance record to + // point at its new active VMM. No new instance-migrate, + // instance-stop, or instance-delete saga can be started + // until this occurs. 
+ // // If the instance actually *has* stopped or failed before a // successful migration out, this is fine, because an // instance-update saga will come along and remove the active VMM diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 6134bc7f7a2..311d7db78a1 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -439,7 +439,10 @@ struct UpdatesRequired { /// deallocated. deprovision: Option, - /// If this is [`Some`], + /// If this is [`Some`], then a network configuration update must be + /// performed: either updating NAT configuration and V2P mappings when the + /// instance has moved to a new sled, or deleting them if it is no longer + /// incarnated. network_config: Option, } From 9d331d6136a07718bd7e40c68f692f47e22da50c Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 31 Jul 2024 14:15:52 -0700 Subject: [PATCH 207/234] lol, OMDB panics when it's Duration::MAX --- dev-tools/omdb/tests/successes.out | 2 +- nexus/src/app/background/init.rs | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index 395132a8026..d4c07899f40 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -487,7 +487,7 @@ task: "external_endpoints" TLS certificates: 0 task: "instance_updater" - configured period: every 10m + configured period: every s currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 977067f8fae..850e63443a1 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -640,17 +640,10 @@ impl BackgroundTasksInitializer { sagas.clone(), config.instance_updater.disable, ); - let period = if config.instance_updater.disable { - // If we're explicitly disabled by the config, don't waste - // energy activating the background task just to have it do nothing. 
- std::time::Duration::MAX - } else { - config.instance_updater.period_secs - }; driver.register( TaskDefinition { name: "instance_updater", description: "detects if instances require update sagas and schedules them", - period, + period: config.instance_watcher.period_secs, task_impl: Box::new(updater), opctx: opctx.child(BTreeMap::new()), watchers: vec![], From d61d1370e34a2e66acb969aa395a444dd65236d1 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 1 Aug 2024 12:28:32 -0700 Subject: [PATCH 208/234] initial test for `vmm_and_migration_update_runtime` --- nexus/db-queries/src/db/datastore/vmm.rs | 150 +++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 4ce6d45603e..67e3705bcc7 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -302,3 +302,153 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::db; + use crate::db::datastore::test_utils::datastore_test; + use crate::db::model::Generation; + use crate::db::model::Migration; + use crate::db::model::VmmRuntimeState; + use crate::db::model::VmmState; + use nexus_test_utils::db::test_setup_database; + use omicron_common::api::internal::nexus; + use omicron_test_utils::dev; + use omicron_uuid_kinds::InstanceUuid; + + #[tokio::test] + async fn test_vmm_and_migration_update_runtime() { + // Setup + let logctx = + dev::test_setup_log("test_vmm_and_migration_update_runtime"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + + let instance_id = InstanceUuid::from_untyped_uuid(Uuid::new_v4()); + let vmm1 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: instance_id.into_untyped_uuid(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.32".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("VMM 1 should be inserted successfully!"); + + let vmm2 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: instance_id.into_untyped_uuid(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.42".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("VMM 2 should be inserted successfully!"); + + let migration1 = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm1.id, vmm2.id), + ) + .await + .expect("migration should be inserted successfully!"); + + // pretend we have just migrated in from vmm1 to vmm2 + let vmm1_migration_out = nexus::MigrationRuntimeState { + migration_id: migration1.id, + state: nexus::MigrationState::Completed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + PropolisUuid::from_untyped_uuid(vmm1.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm1.runtime.r#gen.0.next()), + state: VmmState::Stopping, + }, + Migrations { + migration_in: None, + migration_out: Some(&vmm1_migration_out), + }, + ) + .await + .expect("vmm1 state should 
update"); + let vmm2_migration_in = nexus::MigrationRuntimeState { + migration_id: migration1.id, + state: nexus::MigrationState::Completed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + PropolisUuid::from_untyped_uuid(vmm2.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next()), + state: VmmState::Running, + }, + Migrations { + migration_in: Some(&vmm2_migration_in), + migration_out: None, + }, + ) + .await + .expect("vmm1 state should update"); + + let all_migrations = datastore + .instance_list_migrations( + &opctx, + instance_id, + &DataPageParams::max_page(), + ) + .await + .expect("must list migrations"); + assert_eq!(all_migrations.len(), 1); + let db_migration1 = &all_migrations[0]; + assert_eq!( + db_migration1.source_state, + db::model::MigrationState::COMPLETED + ); + assert_eq!( + db_migration1.target_state, + db::model::MigrationState::COMPLETED + ); + assert_eq!( + db_migration1.source_gen, + Generation(Generation::new().0.next()), + ); + assert_eq!( + db_migration1.target_gen, + Generation(Generation::new().0.next()), + ); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} From b657fbc2e7d5c10b80ab99654b7b469c3177a79d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 1 Aug 2024 13:24:47 -0700 Subject: [PATCH 209/234] actually repro the problem --- nexus/db-queries/src/db/datastore/vmm.rs | 140 ++++++++++++++++++++++- 1 file changed, 139 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 67e3705bcc7..c5988468618 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -376,7 +376,14 @@ mod tests { .await .expect("migration should be inserted successfully!"); - // pretend we have just migrated in from vmm1 to vmm2 + info!( + &logctx.log, + "pretending to migrate from vmm1 to vmm2"; + "vmm1" => ?vmm1, + "vmm2" => ?vmm2, + "migration" => ?migration1, + ); + let vmm1_migration_out = nexus::MigrationRuntimeState { migration_id: migration1.id, state: nexus::MigrationState::Completed, @@ -447,6 +454,137 @@ mod tests { Generation(Generation::new().0.next()), ); + // now, let's simulate a second migration, out of vmm2. 
+ let vmm3 = datastore + .vmm_insert( + &opctx, + Vmm { + id: Uuid::new_v4(), + time_created: Utc::now(), + time_deleted: None, + instance_id: instance_id.into_untyped_uuid(), + sled_id: Uuid::new_v4(), + propolis_ip: "10.1.9.69".parse().unwrap(), + propolis_port: 420.into(), + runtime: VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation::new(), + state: VmmState::Running, + }, + }, + ) + .await + .expect("VMM 2 should be inserted successfully!"); + + let migration2 = datastore + .migration_insert( + &opctx, + Migration::new(Uuid::new_v4(), instance_id, vmm2.id, vmm3.id), + ) + .await + .expect("migration 2 should be inserted successfully!"); + info!( + &logctx.log, + "pretending to migrate from vmm2 to vmm3"; + "vmm2" => ?vmm2, + "vmm3" => ?vmm3, + "migration" => ?migration2, + ); + + let vmm2_migration_out = nexus::MigrationRuntimeState { + migration_id: migration2.id, + state: nexus::MigrationState::Completed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + PropolisUuid::from_untyped_uuid(vmm2.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm2.runtime.r#gen.0.next()), + state: VmmState::Destroyed, + }, + Migrations { + migration_in: Some(&vmm2_migration_in), + migration_out: Some(&vmm2_migration_out), + }, + ) + .await + .expect("vmm2 state should update"); + + let vmm3_migration_in = nexus::MigrationRuntimeState { + migration_id: migration2.id, + // Let's make this fail, just for fun... + state: nexus::MigrationState::Failed, + r#gen: Generation::new().0.next(), + time_updated: Utc::now(), + }; + datastore + .vmm_and_migration_update_runtime( + PropolisUuid::from_untyped_uuid(vmm3.id), + &VmmRuntimeState { + time_state_updated: Utc::now(), + r#gen: Generation(vmm3.runtime.r#gen.0.next()), + state: VmmState::Destroyed, + }, + Migrations { + migration_in: Some(&vmm3_migration_in), + migration_out: None, + }, + ) + .await + .expect("vmm3 state should update"); + + let all_migrations = datastore + .instance_list_migrations( + &opctx, + instance_id, + &DataPageParams::max_page(), + ) + .await + .expect("must list migrations"); + assert_eq!(all_migrations.len(), 2); + + // the previous migration should not have closed. + let new_db_migration1 = all_migrations + .iter() + .find(|m| m.id == migration1.id) + .expect("query must include migration1"); + assert_eq!(new_db_migration1.source_state, db_migration1.source_state); + assert_eq!(new_db_migration1.source_gen, db_migration1.source_gen); + assert_eq!( + db_migration1.time_source_updated, + new_db_migration1.time_source_updated + ); + assert_eq!(new_db_migration1.target_state, db_migration1.target_state); + assert_eq!(new_db_migration1.target_gen, db_migration1.target_gen,); + assert_eq!( + new_db_migration1.time_target_updated, + db_migration1.time_target_updated, + ); + + let db_migration2 = all_migrations + .iter() + .find(|m| m.id == migration2.id) + .expect("query must include migration2"); + assert_eq!( + new_db_migration2.source_state, + db::model::MigrationState::COMPLETED + ); + assert_eq!( + db_migration2.target_state, + db::model::MigrationState::FAILED + ); + assert_eq!( + db_migration2.source_gen, + Generation(Generation::new().0.next()), + ); + assert_eq!( + db_migration2.target_gen, + Generation(Generation::new().0.next()), + ); + // Clean up. 
db.cleanup().await.unwrap(); logctx.cleanup_successful(); From bfb85aff1e3e469d148c3420e63a27a2631736c9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 1 Aug 2024 14:39:23 -0700 Subject: [PATCH 210/234] replace vmm/migration CTE with transaction This should fix the CRDB errors on multiple UPDATEs for the same table in a CTE. --- .../db-queries/src/db/datastore/migration.rs | 51 +- nexus/db-queries/src/db/datastore/vmm.rs | 164 ++++-- nexus/db-queries/src/db/queries/mod.rs | 1 - nexus/db-queries/src/db/queries/vmm.rs | 549 ------------------ nexus/src/app/instance.rs | 1 + nexus/src/app/sagas/instance_update/mod.rs | 2 + 6 files changed, 176 insertions(+), 592 deletions(-) delete mode 100644 nexus/db-queries/src/db/queries/vmm.rs diff --git a/nexus/db-queries/src/db/datastore/migration.rs b/nexus/db-queries/src/db/datastore/migration.rs index 8a7d1c645bd..128239503cb 100644 --- a/nexus/db-queries/src/db/datastore/migration.rs +++ b/nexus/db-queries/src/db/datastore/migration.rs @@ -6,12 +6,16 @@ use super::DataStore; use crate::context::OpContext; +use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::model::{Migration, MigrationState}; +use crate::db::model::Generation; +use crate::db::model::Migration; +use crate::db::model::MigrationState; use crate::db::pagination::paginated; use crate::db::schema::migration::dsl; use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateAndQueryResult; use crate::db::update_and_check::UpdateStatus; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -23,6 +27,7 @@ use omicron_common::api::external::UpdateResult; use omicron_common::api::internal::nexus; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; +use omicron_uuid_kinds::PropolisUuid; use uuid::Uuid; impl DataStore { @@ -123,6 +128,50 @@ impl DataStore { }) .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + + pub(crate) async fn migration_update_source_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + vmm_id: &PropolisUuid, + migration: &nexus::MigrationRuntimeState, + ) -> Result, diesel::result::Error> { + let generation = Generation(migration.r#gen); + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration.migration_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::source_gen.lt(generation)) + .filter(dsl::source_propolis_id.eq(vmm_id.into_untyped_uuid())) + .set(( + dsl::source_state.eq(MigrationState(migration.state)), + dsl::source_gen.eq(generation), + dsl::time_source_updated.eq(migration.time_updated), + )) + .check_if_exists::(migration.migration_id) + .execute_and_check(conn) + .await + } + + pub(crate) async fn migration_update_target_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + vmm_id: &PropolisUuid, + migration: &nexus::MigrationRuntimeState, + ) -> Result, diesel::result::Error> { + let generation = Generation(migration.r#gen); + diesel::update(dsl::migration) + .filter(dsl::id.eq(migration.migration_id)) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::target_gen.lt(generation)) + .filter(dsl::target_propolis_id.eq(vmm_id.into_untyped_uuid())) + .set(( + dsl::target_state.eq(MigrationState(migration.state)), + dsl::target_gen.eq(generation), + dsl::time_target_updated.eq(migration.time_updated), + )) + .check_if_exists::(migration.migration_id) + .execute_and_check(conn) + .await + } } #[cfg(test)] diff --git a/nexus/db-queries/src/db/datastore/vmm.rs 
b/nexus/db-queries/src/db/datastore/vmm.rs index c5988468618..14a922fcf21 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -7,6 +7,7 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; +use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::Vmm; @@ -15,18 +16,22 @@ use crate::db::model::VmmState as DbVmmState; use crate::db::pagination::paginated; use crate::db::schema::vmm::dsl; use crate::db::update_and_check::UpdateAndCheck; +use crate::db::update_and_check::UpdateAndQueryResult; use crate::db::update_and_check::UpdateStatus; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; +use omicron_common::api::external::InternalContext; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; +use omicron_common::api::internal::nexus; use omicron_common::api::internal::nexus::Migrations; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; @@ -133,29 +138,41 @@ impl DataStore { vmm_id: &PropolisUuid, new_runtime: &VmmRuntimeState, ) -> Result { - let updated = diesel::update(dsl::vmm) + self.vmm_update_runtime_on_connection( + &*self.pool_connection_unauthorized().await?, + vmm_id, + new_runtime, + ) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Vmm, + LookupType::ById(vmm_id.into_untyped_uuid()), + ), + ) + }) + } + + async fn vmm_update_runtime_on_connection( + &self, + conn: &async_bb8_diesel::Connection, + vmm_id: &PropolisUuid, + new_runtime: &VmmRuntimeState, + ) -> Result, diesel::result::Error> { + diesel::update(dsl::vmm) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) .filter(dsl::state_generation.lt(new_runtime.gen)) .set(new_runtime.clone()) .check_if_exists::(vmm_id.into_untyped_uuid()) - .execute_and_check(&*self.pool_connection_unauthorized().await?) + .execute_and_check(conn) .await - .map(|r| match r.status { - UpdateStatus::Updated => true, - UpdateStatus::NotUpdatedButExists => false, - }) - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByLookup( - ResourceType::Vmm, - LookupType::ById(vmm_id.into_untyped_uuid()), - ), - ) - })?; - - Ok(updated) } /// Updates a VMM record and associated migration record(s) with a single @@ -185,33 +202,94 @@ impl DataStore { /// - `Err` if another error occurred while accessing the database. pub async fn vmm_and_migration_update_runtime( &self, + opctx: &OpContext, vmm_id: PropolisUuid, new_runtime: &VmmRuntimeState, - migrations: Migrations<'_>, + Migrations { migration_in, migration_out }: Migrations<'_>, ) -> Result { - let query = crate::db::queries::vmm::VmmAndMigrationUpdate::new( - vmm_id, - new_runtime.clone(), - migrations, - ); - - // The VmmAndMigrationUpdate query handles and indicates failure to find - // either the VMM or the migration, so a query failure here indicates - // some kind of internal error and not a failed lookup. 
- let result = query - .execute_and_check(&*self.pool_connection_unauthorized().await?) + fn migration_id( + m: Option<&nexus::MigrationRuntimeState>, + ) -> Option { + m.as_ref().map(|m| m.migration_id) + } + + if migration_id(migration_in) == migration_id(migration_out) { + return Err(Error::conflict( + "migrating from a VMM to itself is nonsensical", + )) + .internal_context(format!("migration_in: {migration_in:?}; migration_out: {migration_out:?}")); + } + + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("vmm_and_migration_update_runtime") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let vmm_updated = self + .vmm_update_runtime_on_connection( + &conn, + &vmm_id, + new_runtime, + ) + .await.map(|r| match r.status { UpdateStatus::Updated => true, UpdateStatus::NotUpdatedButExists => false })?; + let migration_out_updated = match migration_out { + Some(migration) => { + let r = self.migration_update_source_on_connection( + &conn, &vmm_id, migration, + ) + .await?; + match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => match r.found { + m if m.time_deleted.is_some() => return Err(err.bail(Error::Gone)), + m if m.source_propolis_id != vmm_id.into_untyped_uuid() => { + return Err(err.bail(Error::invalid_value( + "source propolis UUID", + format!("{vmm_id} is not the source VMM of this migration"), + ))); + } + // Not updated, generation has advanced. + _ => false + }, + } + }, + None => false, + }; + let migration_in_updated = match migration_in { + Some(migration) => { + let r = self.migration_update_target_on_connection( + &conn, &vmm_id, migration, + ) + .await?; + match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => match r.found { + m if m.time_deleted.is_some() => return Err(err.bail(Error::Gone)), + m if m.target_propolis_id != vmm_id.into_untyped_uuid() => { + return Err(err.bail(Error::invalid_value( + "target propolis UUID", + format!("{vmm_id} is not the target VMM of this migration"), + ))); + } + // Not updated, generation has advanced. 
+ _ => false + }, + } + }, + None => false, + }; + Ok(VmmStateUpdateResult { + vmm_updated, + migration_in_updated, + migration_out_updated, + }) + }}) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - - Ok(VmmStateUpdateResult { - vmm_updated: match result.vmm_status { - Some(UpdateStatus::Updated) => true, - Some(UpdateStatus::NotUpdatedButExists) => false, - None => false, - }, - migration_in_updated: result.migration_in_status.was_updated(), - migration_out_updated: result.migration_out_status.was_updated(), - }) + .map_err(|e| { + err.take().unwrap_or_else(|| public_error_from_diesel(e, ErrorHandler::Server)) + }) } /// Forcibly overwrites the Propolis IP/Port in the supplied VMM's record with @@ -392,6 +470,7 @@ mod tests { }; datastore .vmm_and_migration_update_runtime( + &opctx, PropolisUuid::from_untyped_uuid(vmm1.id), &VmmRuntimeState { time_state_updated: Utc::now(), @@ -413,6 +492,7 @@ mod tests { }; datastore .vmm_and_migration_update_runtime( + &opctx, PropolisUuid::from_untyped_uuid(vmm2.id), &VmmRuntimeState { time_state_updated: Utc::now(), @@ -499,6 +579,7 @@ mod tests { }; datastore .vmm_and_migration_update_runtime( + &opctx, PropolisUuid::from_untyped_uuid(vmm2.id), &VmmRuntimeState { time_state_updated: Utc::now(), @@ -522,6 +603,7 @@ mod tests { }; datastore .vmm_and_migration_update_runtime( + &opctx, PropolisUuid::from_untyped_uuid(vmm3.id), &VmmRuntimeState { time_state_updated: Utc::now(), @@ -569,7 +651,7 @@ mod tests { .find(|m| m.id == migration2.id) .expect("query must include migration2"); assert_eq!( - new_db_migration2.source_state, + db_migration2.source_state, db::model::MigrationState::COMPLETED ); assert_eq!( diff --git a/nexus/db-queries/src/db/queries/mod.rs b/nexus/db-queries/src/db/queries/mod.rs index 46e8a7bc163..f88b8fab6d8 100644 --- a/nexus/db-queries/src/db/queries/mod.rs +++ b/nexus/db-queries/src/db/queries/mod.rs @@ -8,7 +8,6 @@ pub mod disk; pub mod external_ip; pub mod ip_pool; -pub mod vmm; #[macro_use] mod next_item; pub mod network_interface; diff --git a/nexus/db-queries/src/db/queries/vmm.rs b/nexus/db-queries/src/db/queries/vmm.rs deleted file mode 100644 index e8eec47141d..00000000000 --- a/nexus/db-queries/src/db/queries/vmm.rs +++ /dev/null @@ -1,549 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at https://mozilla.org/MPL/2.0/. - -//! Implement a query for updating an instance and VMM in a single CTE. - -use async_bb8_diesel::AsyncRunQueryDsl; -use diesel::prelude::QueryResult; -use diesel::query_builder::{Query, QueryFragment, QueryId}; -use diesel::result::Error as DieselError; -use diesel::sql_types::{Nullable, Uuid as SqlUuid}; -use diesel::{pg::Pg, query_builder::AstPass}; -use diesel::{Column, ExpressionMethods, QueryDsl, RunQueryDsl}; -use nexus_db_model::{ - schema::{migration::dsl as migration_dsl, vmm::dsl as vmm_dsl}, - Generation, MigrationState, VmmRuntimeState, -}; -use omicron_common::api::internal::nexus::{MigrationRuntimeState, Migrations}; -use omicron_uuid_kinds::{GenericUuid, PropolisUuid}; -use uuid::Uuid; - -use crate::db::pool::DbConnection; -use crate::db::update_and_check::UpdateStatus; - -/// A CTE that checks and updates the VMM and migration tables in a single -/// atomic operation. 
-// -// The single-table update-and-check CTE has the following form: -// -// WITH found AS (SELECT FROM T WHERE ) -// updated AS (UPDATE T SET RETURNING *) -// SELECT -// found. -// updated. -// found.* -// FROM -// found -// LEFT JOIN -// updated -// ON -// found. = updated.; -// -// The idea behind this query is to have separate "found" and "updated" -// subqueries for the VMM and migration tables, then use those to create two more -// subqueries that perform the joins and yield the results, along the following -// lines: -// -// WITH vmm_found AS (SELECT(SELECT id FROM vmm WHERE vmm.id = id) AS id), -// vmm_updated AS (UPDATE vmm SET ... RETURNING *), -// migration_in_found AS (SELECT( -// SELECT id FROM migration WHERE migration.id = migration_in_id -// ) AS id), -// migration_in_updated AS (UPDATE migration SET ... RETURNING *), -// migration_out_found AS (SELECT( -// SELECT id FROM migration WHERE migration.id = migration_out_id -// ) AS id), -// migration_out_updated AS (UPDATE migration SET ... RETURNING *), -// vmm_result AS ( -// SELECT vmm_found.id AS found, vmm_updated.id AS updated -// FROM vmm_found -// LEFT JOIN vmm_updated -// ON vmm_found.id = vmm_updated.id -// ), -// migration_in_result AS ( -// SELECT migration_in_found.id AS found, migration_in_updated.id AS updated -// FROM migration_in_found -// LEFT JOIN migration_in_updated -// ON migration_in_found.id = migration_in_updated.id -// ), -// migration_out_result AS ( .. ) -// SELECT vmm_result.found, vmm_result.updated, migration_in_result.found, -// migration_in_result.updated, migration_out_result.found, -// migration_out_result.updated, -// FROM vmm_result, migration_in_result, migration_out_result; -// -// Depending on whether a migration in, migration out, both, or neither were -// provided, the structure of the query will differ somewhat. -// -// The "wrapper" SELECTs when finding migrations and VMMs are used to get a NULL -// result in the final output instead of failing the entire query if the target -// object is missing. This maximizes Nexus's flexibility when dealing with -// updates from sled agent that refer to one valid and one deleted object. (This -// can happen if, e.g., sled agent sends a message indicating that a retired VMM -// has finally been destroyed when its instance has since been deleted.) -pub struct VmmAndMigrationUpdate { - vmm_find: Box + Send>, - vmm_update: Box + Send>, - migration_in: Option, - migration_out: Option, -} - -struct Update { - name: &'static str, - id: &'static str, - find: Box + Send>, - update: Box + Send>, -} - -/// Contains the result of a combined instance-and-VMM update operation. -#[derive(Copy, Clone, PartialEq, Debug)] -pub struct VmmAndMigrationUpdateResult { - /// `Some(status)` if the target VMM was found; the wrapped `UpdateStatus` - /// indicates whether the row was updated. `None` if the VMM was not found. - pub vmm_status: Option, - - /// Indicates whether a migration-in update was performed. - pub migration_in_status: RecordUpdateStatus, - - /// Indicates whether a migration-out update was performed. - pub migration_out_status: RecordUpdateStatus, -} - -#[derive(Copy, Clone, PartialEq, Debug)] -pub enum RecordUpdateStatus { - /// No record was found for the provided ID. - NotFound, - /// No record for this table was provided as part of the update. - NotProvided, - /// An update for this record was provided, and a a record matching the - /// provided ID exists. 
- Found(UpdateStatus), -} - -impl RecordUpdateStatus { - pub fn was_updated(self) -> bool { - matches!(self, Self::Found(UpdateStatus::Updated)) - } -} - -/// Computes the update status to return from the results of queries that find -/// and update an object with an ID of type `T`. -fn compute_update_status( - found: Option, - updated: Option, -) -> Option -where - T: PartialEq + std::fmt::Display, -{ - match (found, updated) { - // If both the "find" and "update" prongs returned an ID, the row was - // updated. The IDs should match in this case (if they don't then the - // query was constructed very strangely!). - (Some(found_id), Some(updated_id)) if found_id == updated_id => { - Some(UpdateStatus::Updated) - } - // If the "find" prong returned an ID but the "update" prong didn't, the - // row exists but wasn't updated. - (Some(_), None) => Some(UpdateStatus::NotUpdatedButExists), - // If neither prong returned anything, indicate the row is missing. - (None, None) => None, - // If both prongs returned an ID, but they don't match, something - // terrible has happened--the prongs must have referred to different - // IDs! - (Some(found_id), Some(mismatched_id)) => unreachable!( - "updated ID {} didn't match found ID {}", - mismatched_id, found_id - ), - // Similarly, if the target ID was not found but something was updated - // anyway, then something is wrong with the update query--either it has - // the wrong ID or did not filter rows properly. - (None, Some(updated_id)) => unreachable!( - "ID {} was updated but no found ID was supplied", - updated_id - ), - } -} - -impl VmmAndMigrationUpdate { - pub fn new( - vmm_id: PropolisUuid, - new_vmm_runtime_state: VmmRuntimeState, - Migrations { migration_in, migration_out }: Migrations<'_>, - ) -> Self { - let vmm_find = Box::new( - vmm_dsl::vmm - .filter(vmm_dsl::id.eq(vmm_id.into_untyped_uuid())) - .select(vmm_dsl::id), - ); - - let vmm_update = Box::new( - diesel::update(vmm_dsl::vmm) - .filter(vmm_dsl::time_deleted.is_null()) - .filter(vmm_dsl::id.eq(vmm_id.into_untyped_uuid())) - .filter(vmm_dsl::state_generation.lt(new_vmm_runtime_state.gen)) - .set(new_vmm_runtime_state), - ); - - fn migration_find( - migration_id: Uuid, - ) -> Box + Send> { - Box::new( - migration_dsl::migration - .filter(migration_dsl::id.eq(migration_id)) - .filter(migration_dsl::time_deleted.is_null()) - .select(migration_dsl::id), - ) - } - - let migration_in = migration_in.cloned().map( - |MigrationRuntimeState { - migration_id, - state, - gen, - time_updated, - }| { - let state = MigrationState::from(state); - let gen = Generation::from(gen); - let update = Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::target_propolis_id - .eq(vmm_id.into_untyped_uuid()), - ) - .filter(migration_dsl::target_gen.lt(gen)) - .set(( - migration_dsl::target_state.eq(state), - migration_dsl::time_target_updated.eq(time_updated), - migration_dsl::target_gen.eq(gen), - )), - ); - Update { - find: migration_find(migration_id), - update, - name: "migration_in", - id: migration_dsl::id::NAME, - } - }, - ); - - let migration_out = migration_out.cloned().map( - |MigrationRuntimeState { - migration_id, - state, - gen, - time_updated, - }| { - let state = MigrationState::from(state); - let gen = Generation::from(gen); - let update = Box::new( - diesel::update(migration_dsl::migration) - .filter(migration_dsl::id.eq(migration_id)) - .filter( - migration_dsl::source_propolis_id - .eq(vmm_id.into_untyped_uuid()), - ) - 
.filter(migration_dsl::source_gen.lt(gen)) - .set(( - migration_dsl::source_state.eq(state), - migration_dsl::time_source_updated.eq(time_updated), - migration_dsl::source_gen.eq(gen), - )), - ); - Update { - find: migration_find(migration_id), - update, - name: "migration_out", - id: migration_dsl::id::NAME, - } - }, - ); - - Self { vmm_find, vmm_update, migration_in, migration_out } - } - - pub async fn execute_and_check( - self, - conn: &(impl async_bb8_diesel::AsyncConnection + Sync), - ) -> Result { - let has_migration_in = self.migration_in.is_some(); - let has_migration_out = self.migration_out.is_some(); - let ( - vmm_found, - vmm_updated, - migration_in_found, - migration_in_updated, - migration_out_found, - migration_out_updated, - ) = self - .get_result_async::<( - Option, - Option, - Option, - Option, - Option, - Option, - // WHEW! - )>(conn) - .await?; - - let vmm_status = compute_update_status(vmm_found, vmm_updated); - - let migration_in_status = if has_migration_in { - compute_update_status(migration_in_found, migration_in_updated) - .map(RecordUpdateStatus::Found) - .unwrap_or(RecordUpdateStatus::NotFound) - } else { - RecordUpdateStatus::NotProvided - }; - - let migration_out_status = if has_migration_out { - compute_update_status(migration_out_found, migration_out_updated) - .map(RecordUpdateStatus::Found) - .unwrap_or(RecordUpdateStatus::NotFound) - } else { - RecordUpdateStatus::NotProvided - }; - - Ok(VmmAndMigrationUpdateResult { - vmm_status, - migration_in_status, - migration_out_status, - }) - } -} - -impl QueryId for VmmAndMigrationUpdate { - type QueryId = (); - const HAS_STATIC_QUERY_ID: bool = false; -} - -impl Query for VmmAndMigrationUpdate { - type SqlType = ( - Nullable, - Nullable, - Nullable, - Nullable, - Nullable, - Nullable, - ); -} - -impl RunQueryDsl for VmmAndMigrationUpdate {} - -impl Update { - fn push_subqueries<'b>( - &'b self, - out: &mut AstPass<'_, 'b, Pg>, - ) -> QueryResult<()> { - out.push_sql(self.name); - out.push_sql("_found AS (SELECT ("); - self.find.walk_ast(out.reborrow())?; - out.push_sql(") AS ID), "); - out.push_sql(self.name); - out.push_sql("_updated AS ("); - self.update.walk_ast(out.reborrow())?; - out.push_sql("RETURNING id), "); - out.push_sql(self.name); - out.push_sql("_result AS (SELECT "); - out.push_sql(self.name); - out.push_sql("_found."); - out.push_identifier(self.id)?; - out.push_sql(" AS found, "); - out.push_sql(self.name); - out.push_sql("_updated."); - out.push_identifier(self.id)?; - out.push_sql(" AS updated"); - out.push_sql(" FROM "); - out.push_sql(self.name); - out.push_sql("_found LEFT JOIN "); - out.push_sql(self.name); - out.push_sql("_updated ON "); - out.push_sql(self.name); - out.push_sql("_found."); - out.push_identifier(self.id)?; - out.push_sql("= "); - out.push_sql(self.name); - out.push_sql("_updated."); - out.push_identifier(self.id)?; - out.push_sql(")"); - - Ok(()) - } -} - -impl QueryFragment for VmmAndMigrationUpdate { - fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, Pg>) -> QueryResult<()> { - out.push_sql("WITH "); - - if let Some(ref m) = self.migration_in { - m.push_subqueries(&mut out)?; - out.push_sql(", "); - } - - if let Some(ref m) = self.migration_out { - m.push_subqueries(&mut out)?; - out.push_sql(", "); - } - - out.push_sql("vmm_found AS (SELECT ("); - self.vmm_find.walk_ast(out.reborrow())?; - out.push_sql(") AS id), "); - - out.push_sql("vmm_updated AS ("); - self.vmm_update.walk_ast(out.reborrow())?; - out.push_sql(" RETURNING id), "); - out.push_sql("vmm_result AS 
("); - out.push_sql("SELECT vmm_found."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(" AS found, vmm_updated."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(" AS updated"); - out.push_sql(" FROM vmm_found LEFT JOIN vmm_updated ON vmm_found."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(" = vmm_updated."); - out.push_identifier(vmm_dsl::id::NAME)?; - out.push_sql(") "); - - fn push_select_from_result( - update: Option<&Update>, - out: &mut AstPass<'_, '_, Pg>, - ) { - if let Some(update) = update { - out.push_sql(update.name); - out.push_sql("_result.found, "); - out.push_sql(update.name); - out.push_sql("_result.updated"); - } else { - out.push_sql("NULL, NULL") - } - } - - out.push_sql("SELECT vmm_result.found, vmm_result.updated, "); - push_select_from_result(self.migration_in.as_ref(), &mut out); - out.push_sql(", "); - push_select_from_result(self.migration_out.as_ref(), &mut out); - out.push_sql(" "); - - out.push_sql("FROM vmm_result"); - if self.migration_in.is_some() { - out.push_sql(", migration_in_result"); - } - - if self.migration_out.is_some() { - out.push_sql(", migration_out_result"); - } - - Ok(()) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::db::model::Generation; - use crate::db::model::VmmState; - use crate::db::raw_query_builder::expectorate_query_contents; - use chrono::Utc; - use omicron_common::api::internal::nexus::MigrationRuntimeState; - use omicron_common::api::internal::nexus::MigrationState; - use uuid::Uuid; - - // These tests are a bit of a "change detector", but they're here to help - // with debugging too. If you change this query, it can be useful to see - // exactly how the output SQL has been altered. - - fn mk_vmm_state() -> VmmRuntimeState { - VmmRuntimeState { - time_state_updated: Utc::now(), - gen: Generation::new(), - state: VmmState::Starting, - } - } - - fn mk_migration_state() -> MigrationRuntimeState { - let migration_id = Uuid::nil(); - MigrationRuntimeState { - migration_id, - state: MigrationState::Pending, - gen: Generation::new().into(), - time_updated: Utc::now(), - } - } - - #[tokio::test] - async fn expectorate_query_only_vmm() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - - let query = VmmAndMigrationUpdate::new( - vmm_id, - vmm_state, - Migrations::default(), - ); - expectorate_query_contents( - &query, - "tests/output/vmm_and_migration_update_vmm_only.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_and_migration_in() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let migration = mk_migration_state(); - - let query = VmmAndMigrationUpdate::new( - vmm_id, - vmm_state, - Migrations { migration_in: Some(&migration), migration_out: None }, - ); - expectorate_query_contents( - &query, - "tests/output/vmm_and_migration_update_vmm_and_migration_in.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_and_migration_out() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let migration = mk_migration_state(); - - let query = VmmAndMigrationUpdate::new( - vmm_id, - vmm_state, - Migrations { migration_out: Some(&migration), migration_in: None }, - ); - expectorate_query_contents( - &query, - "tests/output/vmm_and_migration_update_vmm_and_migration_out.sql", - ) - .await; - } - - #[tokio::test] - async fn expectorate_query_vmm_and_both_migrations() { - let vmm_id = PropolisUuid::nil(); - let vmm_state = mk_vmm_state(); - let migration_in = 
mk_migration_state(); - let migration_out = mk_migration_state(); - - let query = VmmAndMigrationUpdate::new( - vmm_id, - vmm_state, - Migrations { - migration_in: Some(&migration_in), - migration_out: Some(&migration_out), - }, - ); - expectorate_query_contents( - &query, - "tests/output/vmm_and_migration_update_vmm_and_both_migrations.sql", - ) - .await; - } -} diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 00076dfa483..d5f869a4cfc 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -1853,6 +1853,7 @@ pub(crate) async fn notify_instance_updated( let result = datastore .vmm_and_migration_update_runtime( + &opctx, propolis_id, // TODO(eliza): probably should take this by value... &new_runtime_state.vmm_state.clone().into(), diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 311d7db78a1..7a9fdfb39f9 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2290,6 +2290,7 @@ mod test { .nexus .datastore() .vmm_and_migration_update_runtime( + &self.opctx, vmm_id, &new_runtime, migrations, @@ -2346,6 +2347,7 @@ mod test { .nexus .datastore() .vmm_and_migration_update_runtime( + &self.opctx, vmm_id, &new_runtime, migrations, From 4b7fe6bf83aa34da74bee0331a66782243e6d241 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 1 Aug 2024 15:06:40 -0700 Subject: [PATCH 211/234] don't reject runtime states with no migration WHOOPS! --- nexus/db-queries/src/db/datastore/vmm.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 14a922fcf21..14c3405a705 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -213,7 +213,13 @@ impl DataStore { m.as_ref().map(|m| m.migration_id) } - if migration_id(migration_in) == migration_id(migration_out) { + // If both a migration-in and migration-out update was provided for this + // VMM, they can't be from the same migration, since migrating from a + // VMM to itself wouldn't make sense... 
+ let migration_out_id = migration_id(migration_out); + if migration_out_id.is_some() + && migration_out_id == migration_id(migration_in) + { return Err(Error::conflict( "migrating from a VMM to itself is nonsensical", )) From c7efd477f628460f4405460f3e8177ff6197a471 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 10:04:37 -0700 Subject: [PATCH 212/234] start addressing @gjcolombo's feedback thiss is just the easy low-hanging fruit, i'll do the more complex suggestions next :) --- nexus/db-queries/src/db/datastore/instance.rs | 18 +++++++++++------- nexus/src/app/sagas/instance_update/mod.rs | 14 ++++++++------ sled-agent/src/common/instance.rs | 8 ++++++++ 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 31380feb96f..0ed32334e7b 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -47,6 +47,7 @@ use omicron_common::api::external::Error; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::LookupResult; use omicron_common::api::external::LookupType; +use omicron_common::api::external::MessagePair; use omicron_common::api::external::ResourceType; use omicron_common::bail_unless; use omicron_uuid_kinds::GenericUuid; @@ -701,12 +702,15 @@ impl DataStore { &*self.pool_connection_authorized(opctx).await?, ) .await - .map_err(|error| { - Error::conflict(format!( - "cannot set migration ID {migration_id} for instance \ - {instance_id} (perhaps a previous migration is already \ - set): {error:#}" - )) + .map_err(|error| Error::Conflict { + message: MessagePair::new_full( + "another migration is already in progress".to_string(), + format!( + "cannot set migration ID {migration_id} for instance \ + {instance_id} (perhaps another migration ID is \ + already present): {error:#}" + ), + ), }) } @@ -1309,7 +1313,7 @@ impl DataStore { "attempted to release a lock held by another saga! this is a bug!", )) }, - Some(_) => Err(Error::conflict( + Some(_) => Err(Error::conflict( "attempted to commit an instance update, but the state generation has advanced!" )), None => Err(Error::internal_error( diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 7a9fdfb39f9..45fd94d16f6 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -86,8 +86,10 @@ //! ## Theory of Operation //! //! In order to ensure that changes to the state of an instance are handled -//! reliably, we require that all mutations of an instance` record are performed -//! by a saga. The following sagas currently touch the `instance` record: +//! reliably, we require that all multi-stage operations on an instance --- +//! i.e., operations which cannot be done atomically in a single database query +//! --- on an instance are performed by a saga. The following sagas currently +//! touch the `instance` record: //! //! - [`instance_start`] //! - [`instance_migrate`] @@ -120,9 +122,9 @@ //! multiple sagas mutate the same fields in the instance record, because the //! states from which a particular transition may start limited. However, this //! is not the case for the `instance-update` saga, which may need to run any -//! time a sled-agent publishes a new instance state. Therefore, this saga has -//! the dubious honor of using the only distributed lock in Nexus (at the time -//! of writing), the "instance updater lock". +//! 
time a sled-agent publishes a new instance state. Therefore, this saga +//! ensures mutual exclusion using one of the only distributed locking schemes +//! in Omicron: the "instance updater lock". //! //! ### The Instance-Updater Lock, or, "Distributed RAII" //! @@ -244,7 +246,7 @@ //! action they performed. This is because, unlike `instance-start`, //! `instance-migrate``, or `instance-delete`, the instance-update saga is //! **not** attempting to perform a state change for the instance that was -//! requested by an operator. Instead, it is attempting to update the +//! requested by a user. Instead, it is attempting to update the //! database and networking configuration *to match a state change that has //! already occurred.* //! diff --git a/sled-agent/src/common/instance.rs b/sled-agent/src/common/instance.rs index 4f137439880..adbeb9158f4 100644 --- a/sled-agent/src/common/instance.rs +++ b/sled-agent/src/common/instance.rs @@ -565,6 +565,14 @@ mod test { .expect("instance must have a migration state"); assert_eq!(migration.state, MigrationState::Completed); assert_eq!(migration.gen, prev_migration.gen); + + state.terminate_rudely(false); + let migration = state + .migration_out + .clone() + .expect("instance must have a migration state"); + assert_eq!(migration.state, MigrationState::Completed); + assert_eq!(migration.gen, prev_migration.gen); } #[test] From 8e49853f9f17ce8e9ca1b5787c2f9aaae5c5fd1d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 10:13:03 -0700 Subject: [PATCH 213/234] rm extra backtick --- nexus/src/app/sagas/instance_update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 45fd94d16f6..bff3b5e49c1 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -244,7 +244,7 @@ //! changes made by the forward action. The `instance-update` saga, however, is //! a bit different: most of its nodes don't have reverse actions that undo the //! action they performed. This is because, unlike `instance-start`, -//! `instance-migrate``, or `instance-delete`, the instance-update saga is +//! `instance-migrate`, or `instance-delete`, the instance-update saga is //! **not** attempting to perform a state change for the instance that was //! requested by a user. Instead, it is attempting to update the //! database and networking configuration *to match a state change that has From 5754b3aa53167af8a38e8bb50544670de9c28393 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 11:08:27 -0700 Subject: [PATCH 214/234] activate bg task when dropping lock on unwind --- nexus/src/app/sagas/instance_update/mod.rs | 28 +++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index bff3b5e49c1..067c203caeb 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -675,7 +675,23 @@ const NETWORK_CONFIG_UPDATE: &str = "network_config_update"; declare_saga_actions! { instance_update; - // Become the instance updater + // Become the instance updater. + // + // This action inherits the instance-updater lock from the + // `start-instance-update` saga, which attempts to compare-and-swap in a new + // saga UUID. 
This ensuring that only one child update saga is + // actually allowed to proceed, even if the `start-instance-update` saga's + // "fetch_instance_and_start_real_saga" executes multiple times, avoiding + // duplicate work. + // + // Unwinding this action releases the updater lock. In addition, it + // activates the `instance-updater` background task to ensure that a new + // update saga is started in a timely manner, to perform the work that the + // unwinding saga was *supposed* to do. Since this action only succeeds if + // the lock was acquired, and this saga is only started if updates are + // required, having this action activate the background task when unwinding + // avoids unneeded activations when a saga fails just because it couldn't + // get the lock. BECOME_UPDATER -> "updater_lock" { + siu_become_updater - siu_unbecome_updater @@ -862,6 +878,16 @@ async fn siu_unbecome_updater( unwind_instance_lock(lock, serialized_authn, authz_instance, &sagactx) .await; + // Now that we've released the lock, activate the `instance-updater` + // background task to make sure that a new instance update saga is started + // if the instance still needs to be updated. + sagactx + .user_data() + .nexus() + .background_tasks + .task_instance_updater + .activate(); + Ok(()) } From 4487a6ab97295edb0a9b64ddac7f90a5bf449784 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 13:16:02 -0700 Subject: [PATCH 215/234] don't unset migration IDs in migration unwind as per @gjcolombo's suggestions in [this comment][1], we no longer unset migration IDs when unwinding, and instead allow the `instance_set_migration_ids` query to succeed if the target VMM is `SagaUnwound` *and* both sides of the current migration are `Failed`. [1]: https://github.com/oxidecomputer/omicron/pull/5749#discussion_r1702417278 --- nexus/db-queries/src/db/datastore/instance.rs | 95 ++++++++++++------- nexus/src/app/sagas/instance_migrate.rs | 58 +++-------- 2 files changed, 75 insertions(+), 78 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 0ed32334e7b..a4e3bd19995 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -22,6 +22,7 @@ use crate::db::model::Generation; use crate::db::model::Instance; use crate::db::model::InstanceRuntimeState; use crate::db::model::Migration; +use crate::db::model::MigrationState; use crate::db::model::Name; use crate::db::model::Project; use crate::db::model::Sled; @@ -635,6 +636,7 @@ impl DataStore { target_propolis_id: PropolisUuid, ) -> Result { use db::schema::instance::dsl; + use db::schema::migration::dsl as migration_dsl; use db::schema::vmm::dsl as vmm_dsl; // Only allow migrating out if the active VMM is running or rebooting. @@ -664,29 +666,53 @@ impl DataStore { // ID to be clobbered. .filter(vmm_dsl::state.eq(VmmState::SagaUnwound)) .select(vmm_dsl::instance_id); + // Subquery for checking if an already present migration ID points at a + // migration where both the source- and target-sides are marked as + // failed. If both are failed, *and* the target VMM is `SagaUnwound` as + // determined by the query above, then it's okay to clobber that + // migration, as it was left behind by a previous migrate saga unwinding. 
+ let current_migration_failed = migration_dsl::migration + .filter(migration_dsl::id.nullable().eq(dsl::migration_id)) + .filter(migration_dsl::target_state.eq(MigrationState::FAILED)) + .filter(migration_dsl::source_state.eq(MigrationState::FAILED)) + .select(migration_dsl::instance_id); diesel::update(dsl::instance) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(instance_id)) .filter( - // To ensure that saga actions that set migration IDs are - // idempotent, we update the row if the migration and target - // VMM IDs are not present *or* if they are already equal to the - // desired values. This way, we can use a `RETURNING` clause to - // fetch the current state after the update, rather than - // `check_if_exists` which returns the prior state, and still - // fail to update the record if another migration/target VMM ID - // is already there. - (dsl::migration_id.is_null().and( - dsl::target_propolis_id - .is_null() - // It's okay to clobber a previously-set target VMM ID - // if (and only if!) it's in the saga-unwound state. - .or(dsl::id.eq_any(target_vmm_unwound)), - )) + // Update the row if and only if one of the following is true: + // + // - The migration and target VMM IDs are not present + (dsl::migration_id + .is_null() + .and(dsl::target_propolis_id.is_null())) + // - The migration and target VMM IDs are set to the values + // we are trying to set. + // + // This way, we can use a `RETURNING` clause to fetch the + // current state after the update, rather than + // `check_if_exists` which returns the prior state, and still + // fail to update the record if another migration/target VMM + // ID is already there. .or(dsl::migration_id .eq(Some(migration_id)) - .and(dsl::target_propolis_id.eq(Some(target_propolis_id)))), + .and(dsl::target_propolis_id.eq(Some(target_propolis_id)))) + // - The migration and target VMM IDs are set to another + // migration, but the target VMM state is `SagaUnwound` and + // the migration is `Failed` on both sides. + // + // This would indicate that the migration/VMM IDs are left + // behind by another migrate saga failing, and are okay to get + // rid of. + .or( + // Note that both of these queries return the instance ID + // from the VMM and migration records, so we check if one was + // found by comparing it to the actual instance ID. + dsl::id + .eq_any(target_vmm_unwound) + .and(dsl::id.eq_any(current_migration_failed)), + ), ) .filter(dsl::active_propolis_id.eq(src_propolis_id)) .filter(dsl::id.eq_any(vmm_ok)) @@ -1995,7 +2021,6 @@ mod tests { instance.runtime().migration_id, instance2.runtime().migration_id ); - let instance = instance2; // Trying to set a new migration should fail, as long as the prior stuff // is still in place. @@ -2036,7 +2061,11 @@ mod tests { PropolisUuid::from_untyped_uuid(vmm3.id), ) .await - ).expect_err("trying to set migration IDs should fail when a previous migration and VMM are still there"); + ) + .expect_err( + "trying to set migration IDs should fail when a previous \ + migration and VMM are still there", + ); // Pretend the previous migration saga has unwound the VMM let updated = dbg!( @@ -2054,7 +2083,7 @@ mod tests { .expect("updating VMM state should be fine"); assert!(updated); - // It should still fail due to the presence of the migration ID. + // It should still fail, since the migration is still in progress. 
dbg!( datastore .instance_set_migration_ids( @@ -2065,27 +2094,23 @@ mod tests { PropolisUuid::from_untyped_uuid(vmm3.id), ) .await - ).expect_err("trying to set migration IDs should fail when a previous migration ID is still there"); + ) + .expect_err( + "trying to set migration IDs should fail when a previous \ + migration ID is present and not marked as failed", + ); - // Remove the migration ID. + // Now, mark the previous migration as Failed. let updated = dbg!(datastore - .instance_update_runtime( - &instance_id, - &InstanceRuntimeState { - time_updated: Utc::now(), - r#gen: Generation(instance.runtime_state.gen.0.next()), - nexus_state: InstanceState::Vmm, - propolis_id: Some(vmm1.id), - migration_id: None, - ..instance.runtime_state.clone() - }, - ) + .migration_mark_failed(&opctx, migration.id) .await - .expect("instance update should work")); + .expect( + "we should be able to mark the previous migration as failed" + )); assert!(updated); - // Now that the migration ID is gone, we should be able to clobber the - // SagaUnwound VMM ID. + // If the current migration is failed on both sides *and* the current + // VMM is SagaUnwound, we should be able to clobber them with new IDs. let instance = dbg!( datastore .instance_set_migration_ids( diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 2c61b4380c8..f7f46588165 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -91,9 +91,23 @@ declare_saga_actions! { // those tables. Because the `instance` table is queried in the public API, // we take care to ensure that it doesn't have "dangling pointers" to // records in the `vmm` and `migration` tables that don't exist yet. + // + // Note that unwinding this action does *not* clear the migration IDs from + // the instance record. This is to avoid a potential race with the instance + // update saga where: + // + // - a `instance-migrate` saga sets the migration IDs at instance state + // generation _N_ + // - an `instance-update` saga increments the instance's state generation to + // _N_ + 1 + // - the `instance-migrate` saga unwinds and attempts to clear the migration + // IDs, but can't, because the state generation has advanced. + // + // Instead, we leave the migration IDs in place and rely on setting the VMM + // state to `SagaUnwound` to indicate to other future `instance-migrate` + // sagas that it's okay to start a new migration. SET_MIGRATION_IDS -> "set_migration_ids" { + sim_set_migration_ids - - sim_clear_migration_ids } // This step registers the instance with the destination sled. 
Care is @@ -368,48 +382,6 @@ async fn sim_set_migration_ids( .map_err(ActionError::action_failed) } -async fn sim_clear_migration_ids( - sagactx: NexusActionContext, -) -> Result<(), anyhow::Error> { - let osagactx = sagactx.user_data(); - let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, - ); - let db_instance = params.instance; - let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id()); - let src_propolis_id = PropolisUuid::from_untyped_uuid(params.src_vmm.id); - let migration_id = sagactx.lookup::("migrate_id")?; - let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; - - info!(osagactx.log(), "clearing migration IDs for saga unwind"; - "instance_id" => %db_instance.id(), - "migration_id" => %migration_id, - "src_propolis_id" => %src_propolis_id, - "dst_propolis_id" => %dst_propolis_id); - - if let Err(e) = osagactx - .datastore() - .instance_unset_migration_ids( - &opctx, - instance_id, - migration_id, - dst_propolis_id, - ) - .await - { - warn!(osagactx.log(), - "Error clearing migration IDs during rollback"; - "instance_id" => %instance_id, - "src_propolis_id" => %src_propolis_id, - "dst_propolis_id" => %dst_propolis_id, - "error" => ?e); - } - - Ok(()) -} - async fn sim_ensure_destination_propolis( sagactx: NexusActionContext, ) -> Result<(), ActionError> { From 90b2d04ff6fa21d13406a127da89809d0f1270bb Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 13:37:46 -0700 Subject: [PATCH 216/234] fix "succeed idempotently" tests not testing that --- nexus/src/app/sagas/instance_update/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 067c203caeb..16b96b37981 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1883,7 +1883,7 @@ mod test { .source(MigrationState::Failed, VmmState::Running) .setup_test(cptestctx, &other_sleds) .await - .run_saga_basic_usage_succeeds_test(cptestctx) + .run_actions_succeed_idempotently_test(cptestctx) .await; } @@ -1945,7 +1945,7 @@ mod test { .source(MigrationState::Failed, VmmState::Running) .setup_test(cptestctx, &other_sleds) .await - .run_saga_basic_usage_succeeds_test(cptestctx) + .run_actions_succeed_idempotently_test(cptestctx) .await; } @@ -2007,7 +2007,7 @@ mod test { .source(MigrationState::Failed, VmmState::Destroyed) .setup_test(cptestctx, &other_sleds) .await - .run_saga_basic_usage_succeeds_test(cptestctx) + .run_actions_succeed_idempotently_test(cptestctx) .await; } @@ -2069,7 +2069,7 @@ mod test { .source(MigrationState::Failed, VmmState::Destroyed) .setup_test(cptestctx, &other_sleds) .await - .run_saga_basic_usage_succeeds_test(cptestctx) + .run_actions_succeed_idempotently_test(cptestctx) .await; } From 73cdb72b799ec073aad36820d6d0dc176908e93b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 14:04:36 -0700 Subject: [PATCH 217/234] tests for migration completed but target VMM destroyed --- nexus/src/app/sagas/instance_update/mod.rs | 116 +++++++++++++++++---- 1 file changed, 98 insertions(+), 18 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 16b96b37981..445653b8d29 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2101,6 +2101,68 @@ mod test { .await; } + // === migration completed, but then the 
target was destroyed === + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_but_target_destroyed_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Destroyed) + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) + .await + .run_saga_basic_usage_succeeds_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_but_target_destroyed_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Destroyed) + .source(MigrationState::Completed, VmmState::Stopping) + .setup_test(cptestctx, &other_sleds) + .await + .run_actions_succeed_idempotently_test(cptestctx) + .await; + } + + #[nexus_test(server = crate::Server)] + async fn test_migration_completed_but_target_destroyed_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = setup_test_project(&cptestctx.external_client).await; + + let outcome = MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Destroyed) + .source(MigrationState::Completed, VmmState::Stopping); + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + outcome + .setup_test(cptestctx, &other_sleds) + .await + .saga_params() + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + #[derive(Clone, Copy, Default)] struct MigrationOutcome { source: Option<(MigrationState, VmmState)>, @@ -2397,9 +2459,9 @@ mod test { info!( cptestctx.logctx.log, "checking update saga results after migration"; - "source_outcome" => ?self.outcome.source.as_ref(), - "target_outcome" => ?self.outcome.target.as_ref(), - "migration_failed" => self.outcome.failed, + "source_outcome" => ?dbg!(self.outcome.source.as_ref()), + "target_outcome" => ?dbg!(self.outcome.target.as_ref()), + "migration_failed" => dbg!(self.outcome.failed), ); use test_helpers::*; @@ -2413,6 +2475,13 @@ mod test { assert_instance_unlocked(instance); assert_instance_record_is_consistent(instance); + let target_destroyed = self + .outcome + .target + .as_ref() + .map(|(_, state)| state == &VmmState::Destroyed) + .unwrap_or(false); + if self.outcome.failed { assert_eq!( instance_runtime.migration_id, None, @@ -2423,12 +2492,29 @@ mod test { "target VMM ID must be unset when a migration has failed" ); } else { - assert_eq!( - active_vmm_id, - Some(self.target_vmm_id()), - "target VMM must be in the active VMM position after migration success", - ); - assert_eq!(instance_runtime.nexus_state, InstanceState::Vmm); + if dbg!(target_destroyed) { + assert_eq!( + active_vmm_id, None, + "if the target VMM was destroyed, it should be unset, \ + even if a migration succeeded", + ); + assert_eq!( + instance_runtime.nexus_state, + InstanceState::NoVmm + ); + } else { + assert_eq!( + active_vmm_id, + Some(self.target_vmm_id()), + "target VMM must be in the active VMM position after \ + migration success", + ); + + assert_eq!( + instance_runtime.nexus_state, 
+ InstanceState::Vmm + ); + } if self .outcome .target @@ -2470,13 +2556,6 @@ mod test { "source VMM should exist if and only if the source hasn't been destroyed", ); - let target_destroyed = self - .outcome - .target - .as_ref() - .map(|(_, state)| state == &VmmState::Destroyed) - .unwrap_or(false); - assert_eq!( self.target_resource_records_exist(cptestctx).await, !target_destroyed, @@ -2492,8 +2571,9 @@ mod test { !src_destroyed } else { // Otherwise, if the migration succeeded, the instance should be - // on the target VMM. - true + // on the target VMM, and virtual provisioning records should + // exist as long as the + !target_destroyed }; assert_eq!( From b031bd0bc5194290724009749e816b8dbf2c6ff9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 14:47:40 -0700 Subject: [PATCH 218/234] correctly handle target destroyed migration success --- nexus/src/app/sagas/instance_update/mod.rs | 83 +++++++++++++--------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 445653b8d29..09439b15026 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -475,7 +475,6 @@ impl UpdatesRequired { let mut update_required = false; let mut network_config = None; - let mut deprovision = false; // Has the active VMM been destroyed? let destroy_active_vmm = @@ -487,31 +486,14 @@ impl UpdatesRequired { // handles migration updates, will set this to the new VMM's ID, // instead. new_runtime.propolis_id = None; - new_runtime.nexus_state = InstanceState::NoVmm; update_required = true; - // If and only if the active VMM was destroyed *and* we did - // not successfully migrate out of it, the instance's - // virtual provisioning records and oximeter producer must - // be cleaned up. - // - // If the active VMM was destroyed as a result of a - // successful migration out, the subsequent code for - // determining what to do with the migration will change - // this back. - deprovision = true; - // Similarly, if the active VMM was destroyed and the - // instance has not migrated out of it, we must delete the - // instance's network configuration. Again, if there was a - // migration out, the subsequent migration-handling code - // will change this to a network config update if the - // instance is now living somewhere else. - network_config = Some(NetworkConfigUpdate::Delete); Some(id) } else { None } }); + // Okay, what about the target? let destroy_target_vmm = snapshot.target_vmm.as_ref().and_then(|target_vmm| { if target_vmm.runtime.state == VmmState::Destroyed { @@ -594,7 +576,23 @@ impl UpdatesRequired { new_runtime.propolis_id = Some(migration.target_propolis_id); network_config = Some(NetworkConfigUpdate::to_vmm(new_vmm)); - update_required = true; + } + + // Welp, the migration has succeeded, but the target Propolis + // has also gone away. This is functionally equivalent to having + // the active VMM go to `Destroyed`, so now we have no active + // VMM anymore. 
+ if destroy_target_vmm.is_some() { + info!( + log, + "instance update (migration completed): target VMM \ + has gone away, destroying it!"; + "instance_id" => %instance_id, + "migration_id" => %migration.id, + "src_propolis_id" => %migration.source_propolis_id, + "target_propolis_id" => %migration.target_propolis_id, + ); + new_runtime.propolis_id = None; } // If the target reports that the migration has completed, @@ -614,19 +612,42 @@ impl UpdatesRequired { ); new_runtime.migration_id = None; new_runtime.dst_propolis_id = None; - update_required = true; } - // Even if the active VMM was destroyed (and we set the - // instance's state to `NoVmm` above), it has successfully - // migrated, so leave it in the VMM state and don't deallocate - // virtual provisioning records --- the instance is still - // incarnated. - new_runtime.nexus_state = InstanceState::Vmm; - deprovision = false; + update_required = true; } } + // If the *new* state no longer has a `propolis_id` field, that means + // that the active VMM was destroyed without a successful migration out + // (or, we migrated out to a target VMM that was immediately destroyed, + // which...seems weird but certainly could happen). In that case, the + // instance is no longer incarnated on a sled, and we must update the + // state of the world to reflect that. + let deprovision = if new_runtime.propolis_id.is_none() { + update_required = true; + // We no longer have a VMM. + new_runtime.nexus_state = InstanceState::NoVmm; + // If the active VMM was destroyed and the instance has not migrated + // out of it, we must delete the instance's network configuration. + // + // This clobbers a previously-set network config update to a new + // VMM, because if we set one above, we must have subsequently + // discovered that there actually *is* no new VMM anymore! + network_config = Some(NetworkConfigUpdate::Delete); + // The instance's virtual provisioning records must be deallocated, + // as it is no longer consuming any virtual resources. Providing a + // set of virtual provisioning counters to deallocate also indicates + // that the instance's oximeter producer should be cleaned up. 
+ Some(Deprovision { + project_id: snapshot.instance.project_id, + cpus_diff: i64::from(snapshot.instance.ncpus.0 .0), + ram_diff: snapshot.instance.memory, + }) + } else { + None + }; + if !update_required { return None; } @@ -635,11 +656,7 @@ impl UpdatesRequired { new_runtime, destroy_active_vmm, destroy_target_vmm, - deprovision: deprovision.then(|| Deprovision { - project_id: snapshot.instance.project_id, - cpus_diff: i64::from(snapshot.instance.ncpus.0 .0), - ram_diff: snapshot.instance.memory, - }), + deprovision, network_config, }) } From 18b2f324fe20e78cc2e9c33ea3186cda0cf194e6 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 16:25:49 -0700 Subject: [PATCH 219/234] fix start saga unwinding if duplicate child unwinds --- nexus/src/app/sagas/instance_update/start.rs | 24 ++++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 3350812a0c8..ae7152d3ab0 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -144,7 +144,7 @@ async fn siu_fetch_state_and_start_real_saga( sagactx.saga_params::()?; let osagactx = sagactx.user_data(); let lock_id = sagactx.lookup::(INSTANCE_LOCK_ID)?; - + let instance_id = authz_instance.id(); let log = osagactx.log(); // Did we get the lock? If so, we can start the next saga, otherwise, just @@ -155,7 +155,7 @@ async fn siu_fetch_state_and_start_real_saga( info!( log, "instance update: instance is already locked! doing nothing..."; - "instance_id" => %authz_instance.id(), + "instance_id" => %instance_id, "saga_id" => %lock_id, ); return Ok(()); @@ -178,7 +178,7 @@ async fn siu_fetch_state_and_start_real_saga( info!( log, "instance update: starting real update saga..."; - "instance_id" => %authz_instance.id(), + "instance_id" => %instance_id, "current.runtime_state" => ?state.instance.runtime(), "current.migration" => ?state.migration, "current.active_vmm" => ?state.active_vmm, @@ -189,7 +189,7 @@ async fn siu_fetch_state_and_start_real_saga( "update.destroy_target_vmm" => ?update.destroy_target_vmm, "update.deprovision" => update.deprovision.is_some(), ); - osagactx + if let Err(error) = osagactx .nexus() .sagas .saga_execute::(RealParams { @@ -199,7 +199,21 @@ async fn siu_fetch_state_and_start_real_saga( orig_lock, }) .await - .map_err(ActionError::action_failed)?; + { + warn!( + log, + "instance update: real update saga failed (which *could* \ + mean nothing...)"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If the real saga failed, kick the background task. If the real + // saga failed because this action was executed twice and the second + // child saga couldn't lock the instance, that's fine, because the + // background task will only start new sagas for instances whose DB + // state actually *needs* an update. 
+ osagactx.nexus().background_tasks.task_instance_updater.activate(); + } } else { info!( log, From c8b7421646350062cb843a7be78bf77df0e273f9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 5 Aug 2024 16:51:10 -0700 Subject: [PATCH 220/234] fix completed updates spawning spurious update sagas --- nexus/src/app/sagas/instance_update/mod.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 09439b15026..a1dd9a1757f 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -576,6 +576,7 @@ impl UpdatesRequired { new_runtime.propolis_id = Some(migration.target_propolis_id); network_config = Some(NetworkConfigUpdate::to_vmm(new_vmm)); + update_required = true; } // Welp, the migration has succeeded, but the target Propolis @@ -593,6 +594,7 @@ impl UpdatesRequired { "target_propolis_id" => %migration.target_propolis_id, ); new_runtime.propolis_id = None; + update_required = true; } // If the target reports that the migration has completed, @@ -612,9 +614,8 @@ impl UpdatesRequired { ); new_runtime.migration_id = None; new_runtime.dst_propolis_id = None; + update_required = true; } - - update_required = true; } } @@ -625,7 +626,11 @@ impl UpdatesRequired { // instance is no longer incarnated on a sled, and we must update the // state of the world to reflect that. let deprovision = if new_runtime.propolis_id.is_none() { - update_required = true; + // N.B. that this does *not* set `update_required`, because + // `new_runtime.propolis_id` might be `None` just because there was, + // already, no VMM there. `update_required` gets set above if there + // was any actual state change. + // We no longer have a VMM. new_runtime.nexus_state = InstanceState::NoVmm; // If the active VMM was destroyed and the instance has not migrated From 20a222be1409007f87b416e3987d1557d4e74712 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 09:56:24 -0700 Subject: [PATCH 221/234] update tests to new migrate saga unwinding behavior --- nexus/src/app/sagas/instance_migrate.rs | 42 ++++++++++++++++++------- nexus/src/app/sagas/test_helpers.rs | 33 ++++++++++++++++++- 2 files changed, 63 insertions(+), 12 deletions(-) diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index f7f46588165..bb4bf282e41 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -717,24 +717,44 @@ mod tests { let after_saga = || -> futures::future::BoxFuture<'_, ()> { Box::pin({ async { - // Unwinding at any step should clear the migration IDs from - // the instance record and leave the instance's location - // otherwise untouched. - let new_state = - test_helpers::instance_fetch(cptestctx, instance_id) - .await; + let new_state = test_helpers::instance_fetch_all( + cptestctx, + instance_id, + ) + .await; - let new_instance = new_state.instance(); - let new_vmm = - new_state.vmm().as_ref().expect("vmm should be active"); + let new_instance = new_state.instance; + let new_vmm = new_state + .active_vmm + .as_ref() + .expect("vmm should be active"); - assert!(new_instance.runtime().migration_id.is_none()); - assert!(new_instance.runtime().dst_propolis_id.is_none()); assert_eq!( new_instance.runtime().propolis_id.unwrap(), new_vmm.id ); + // If the instance has had migration IDs set, then both + // sides of the migration should be marked as failed. 
+ if let Some(migration) = new_state.migration { + assert_eq!( + migration.source_state, + db::model::MigrationState::FAILED + ); + assert_eq!( + migration.target_state, + db::model::MigrationState::FAILED + ); + } + // If the instance has a target VMM ID left behind by the + // unwinding saga, that VMM must be in the `SagaUnwound` state. + if let Some(target_vmm) = new_state.target_vmm { + assert_eq!( + target_vmm.runtime.state, + db::model::VmmState::SagaUnwound + ); + } + info!( &log, "migration saga unwind: stopping instance after failed \ diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index 31a77d49988..b9388a1116a 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -20,7 +20,11 @@ use nexus_db_model::InstanceState; use nexus_db_queries::{ authz, context::OpContext, - db::{datastore::InstanceAndActiveVmm, lookup::LookupPath, DataStore}, + db::{ + datastore::{InstanceAndActiveVmm, InstanceGestalt}, + lookup::LookupPath, + DataStore, + }, }; use nexus_test_interface::NexusServer; use nexus_test_utils::start_sled_agent; @@ -214,6 +218,33 @@ pub async fn instance_fetch( db_state } +pub async fn instance_fetch_all( + cptestctx: &ControlPlaneTestContext, + instance_id: InstanceUuid, +) -> InstanceGestalt { + let datastore = cptestctx.server.server_context().nexus.datastore().clone(); + let opctx = test_opctx(&cptestctx); + let (.., authz_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(instance_id.into_untyped_uuid()) + .lookup_for(authz::Action::Read) + .await + .expect("test instance should be present in datastore"); + + let db_state = datastore + .instance_fetch_all(&opctx, &authz_instance) + .await + .expect("test instance's info should be fetchable"); + + info!(&cptestctx.logctx.log, "refetched all instance info from db"; + "instance_id" => %instance_id, + "instance" => ?db_state.instance, + "active_vmm" => ?db_state.active_vmm, + "target_vmm" => ?db_state.target_vmm, + "migration" => ?db_state.migration, + ); + + db_state +} pub async fn instance_fetch_by_name( cptestctx: &ControlPlaneTestContext, name: &str, From 77594e371ce77143797ce7f19341f60998697efa Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 11:01:43 -0700 Subject: [PATCH 222/234] saga idempotency tests should test the real saga --- nexus/src/app/sagas/instance_update/mod.rs | 80 ++++++++++++++++++---- 1 file changed, 65 insertions(+), 15 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index a1dd9a1757f..a3f06223ee1 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -916,8 +916,8 @@ async fn siu_unbecome_updater( async fn siu_update_network_config( sagactx: NexusActionContext, ) -> Result<(), ActionError> { - let Params { ref serialized_authn, ref authz_instance, .. } = - sagactx.saga_params()?; + let RealParams { ref serialized_authn, ref authz_instance, .. 
} = + sagactx.saga_params::()?; let update = sagactx.lookup::(NETWORK_CONFIG_UPDATE)?; @@ -1558,7 +1558,12 @@ mod test { let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; // Build the saga DAG with the provided test parameters - let dag = create_saga_dag::(params).unwrap(); + let real_params = make_real_params( + cptestctx, + &test_helpers::test_opctx(cptestctx), + params, + ).await; + let dag = create_saga_dag::(real_params).unwrap(); crate::app::sagas::test_helpers::actions_succeed_idempotently( &cptestctx.server.server_context().nexus, @@ -1746,7 +1751,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1804,7 +1809,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1866,7 +1871,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1928,7 +1933,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1990,7 +1995,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -2052,7 +2057,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -2114,7 +2119,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -2176,7 +2181,7 @@ mod test { outcome .setup_test(cptestctx, &other_sleds) .await - .saga_params() + .start_saga_params() }) }, || Box::pin(after_unwinding(cptestctx)), @@ -2327,7 +2332,7 @@ mod test { let nexus = &cptestctx.server.server_context().nexus; nexus .sagas - .saga_execute::(self.saga_params()) + .saga_execute::(self.start_saga_params()) .await .expect("update saga should succeed"); @@ -2339,12 +2344,14 @@ mod test { &self, cptestctx: &ControlPlaneTestContext, ) { + let params = make_real_params(cptestctx, &self.opctx, self.start_saga_params()).await; + // Build the saga DAG with the provided test parameters - let dag = create_saga_dag::(self.saga_params()) + let dag = create_saga_dag::(params) .unwrap(); // Run the actions-succeed-idempotently test - crate::app::sagas::test_helpers::actions_succeed_idempotently( + test_helpers::actions_succeed_idempotently( &cptestctx.server.server_context().nexus, dag, ) @@ -2468,7 +2475,7 @@ mod test { .expect("updating migration target state should succeed"); } - fn saga_params(&self) -> Params { + fn start_saga_params(&self) -> Params { Params { authz_instance: self.authz_instance.clone(), serialized_authn: authn::saga::Serialized::for_opctx( @@ -2641,4 +2648,47 @@ mod test { .await } } + + async fn make_real_params( + cptestctx: &ControlPlaneTestContext, + opctx: &OpContext, + Params { authz_instance, serialized_authn }: Params, + ) -> RealParams { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let log = &cptestctx.logctx.log; + + let lock_id = Uuid::new_v4(); + let orig_lock = datastore + .instance_updater_lock(opctx, &authz_instance, lock_id) + .await + .expect("must lock instance"); + let state = datastore + .instance_fetch_all(&opctx, 
&authz_instance) + .await + .expect("instance must exist"); + let update = UpdatesRequired::for_snapshot(&log, &state) + .expect("the test's precondition should require updates"); + + info!( + log, + "made params for real saga"; + "instance" => ?state.instance, + "active_vmm" => ?state.active_vmm, + "target_vmm" => ?state.target_vmm, + "migration" => ?state.migration, + "update.new_runtime" => ?update.new_runtime, + "update.destroy_active_vmm" => ?update.destroy_active_vmm, + "update.destroy_target_vmm" => ?update.destroy_target_vmm, + "update.deprovision" => ?update.deprovision, + "update.network_config" => ?update.network_config, + ); + + RealParams { + authz_instance, + serialized_authn, + update, + orig_lock, + } + } } From 8cd33782d4d980beb5a84c539d9d14948648d13b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 11:21:22 -0700 Subject: [PATCH 223/234] also run unwinding tests with "real" saga --- nexus/src/app/sagas/instance_update/mod.rs | 274 +++++++-------------- 1 file changed, 94 insertions(+), 180 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index a3f06223ee1..18acbe9ba5a 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1562,8 +1562,10 @@ mod test { cptestctx, &test_helpers::test_opctx(cptestctx), params, - ).await; - let dag = create_saga_dag::(real_params).unwrap(); + ) + .await; + let dag = + create_saga_dag::(real_params).unwrap(); crate::app::sagas::test_helpers::actions_succeed_idempotently( &cptestctx.server.server_context().nexus, @@ -1581,13 +1583,22 @@ mod test { ) { let _project_id = setup_test_project(&cptestctx.external_client).await; let nexus = &cptestctx.server.server_context().nexus; + let opctx = test_helpers::test_opctx(cptestctx); + test_helpers::action_failure_can_unwind::( nexus, || { Box::pin(async { - let (_, params) = + let (_, start_saga_params) = setup_active_vmm_destroyed_test(cptestctx).await; - params + + // Since the unwinding test will test unwinding from each + // individual saga node *in the saga DAG constructed by the + // provided params*, we need to give it the "real saga"'s + // params rather than the start saga's params. 
Otherwise, + // we're just testing the unwinding behavior of the trivial + // two-node start saga + make_real_params(cptestctx, &opctx, start_saga_params).await }) }, || Box::pin(after_unwinding(cptestctx)), @@ -1737,27 +1748,10 @@ mod test { async fn test_migration_source_completed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() - .source(MigrationState::Completed, VmmState::Stopping); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + MigrationOutcome::default() + .source(MigrationState::Completed, VmmState::Stopping) + .run_unwinding_test(cptestctx) + .await; } // === migration target completed tests === @@ -1796,26 +1790,10 @@ mod test { async fn test_migration_target_completed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - let outcome = MigrationOutcome::default() - .target(MigrationState::Completed, VmmState::Running); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + MigrationOutcome::default() + .target(MigrationState::Completed, VmmState::Running) + .run_unwinding_test(cptestctx) + .await; } // === migration completed and source destroyed tests === @@ -1856,28 +1834,11 @@ mod test { async fn test_migration_completed_source_destroyed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() + MigrationOutcome::default() .target(MigrationState::Completed, VmmState::Running) - .source(MigrationState::Completed, VmmState::Destroyed); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + .source(MigrationState::Completed, VmmState::Destroyed) + .run_unwinding_test(cptestctx) + .await; } // === migration failed, target not destroyed === @@ -1918,28 +1879,11 @@ mod test { async fn test_migration_target_failed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() + MigrationOutcome::default() .target(MigrationState::Failed, VmmState::Failed) - .source(MigrationState::Failed, VmmState::Running); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || 
Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + .source(MigrationState::Failed, VmmState::Running) + .run_unwinding_test(cptestctx) + .await; } // === migration failed, migration target destroyed tests === @@ -1980,28 +1924,11 @@ mod test { async fn test_migration_target_failed_destroyed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() + MigrationOutcome::default() .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Running); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + .source(MigrationState::Failed, VmmState::Running) + .run_unwinding_test(cptestctx) + .await; } // === migration failed, migration source destroyed tests === @@ -2042,28 +1969,11 @@ mod test { async fn test_migration_source_failed_destroyed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() + MigrationOutcome::default() .target(MigrationState::InProgress, VmmState::Running) - .source(MigrationState::Failed, VmmState::Destroyed); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + .source(MigrationState::Failed, VmmState::Destroyed) + .run_unwinding_test(cptestctx) + .await; } // === migration failed, source and target both destroyed === @@ -2104,28 +2014,11 @@ mod test { async fn test_migration_failed_everyone_died_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() + MigrationOutcome::default() .target(MigrationState::Failed, VmmState::Destroyed) - .source(MigrationState::Failed, VmmState::Destroyed); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + .source(MigrationState::Failed, VmmState::Destroyed) + .run_unwinding_test(cptestctx) + .await; } // === migration completed, but then the target was destroyed === @@ -2166,28 +2059,11 @@ mod test { async fn test_migration_completed_but_target_destroyed_can_unwind( cptestctx: &ControlPlaneTestContext, ) { - let nexus = &cptestctx.server.server_context().nexus; - let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; - let _project_id = setup_test_project(&cptestctx.external_client).await; - - let outcome = MigrationOutcome::default() + MigrationOutcome::default() .target(MigrationState::Completed, VmmState::Destroyed) - .source(MigrationState::Completed, 
VmmState::Stopping); - - test_helpers::action_failure_can_unwind::( - nexus, - || { - Box::pin(async { - outcome - .setup_test(cptestctx, &other_sleds) - .await - .start_saga_params() - }) - }, - || Box::pin(after_unwinding(cptestctx)), - &cptestctx.logctx.log, - ) - .await; + .source(MigrationState::Completed, VmmState::Stopping) + .run_unwinding_test(cptestctx) + .await; } #[derive(Clone, Copy, Default)] @@ -2219,6 +2095,44 @@ mod test { ) -> MigrationTest { MigrationTest::setup(self, cptestctx, other_sleds).await } + + async fn run_unwinding_test( + &self, + cptestctx: &ControlPlaneTestContext, + ) { + let nexus = &cptestctx.server.server_context().nexus; + let other_sleds = test_helpers::add_sleds(cptestctx, 1).await; + let _project_id = + setup_test_project(&cptestctx.external_client).await; + let opctx = test_helpers::test_opctx(&cptestctx); + + test_helpers::action_failure_can_unwind::< + SagaDoActualInstanceUpdate, + _, + _, + >( + nexus, + || { + Box::pin(async { + // Since the unwinding test will test unwinding from each + // individual saga node *in the saga DAG constructed by the + // provided params*, we need to give it the "real saga"'s + // params rather than the start saga's params. Otherwise, + // we're just testing the unwinding behavior of the trivial + // two-node start saga. + let start_saga_params = self + .setup_test(cptestctx, &other_sleds) + .await + .start_saga_params(); + make_real_params(cptestctx, &opctx, start_saga_params) + .await + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } } struct MigrationTest { @@ -2344,11 +2258,16 @@ mod test { &self, cptestctx: &ControlPlaneTestContext, ) { - let params = make_real_params(cptestctx, &self.opctx, self.start_saga_params()).await; + let params = make_real_params( + cptestctx, + &self.opctx, + self.start_saga_params(), + ) + .await; // Build the saga DAG with the provided test parameters - let dag = create_saga_dag::(params) - .unwrap(); + let dag = + create_saga_dag::(params).unwrap(); // Run the actions-succeed-idempotently test test_helpers::actions_succeed_idempotently( @@ -2684,11 +2603,6 @@ mod test { "update.network_config" => ?update.network_config, ); - RealParams { - authz_instance, - serialized_authn, - update, - orig_lock, - } + RealParams { authz_instance, serialized_authn, update, orig_lock } } } From e04b278b2be3da288c335f3e8e1b657091a8ea0d Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 11:23:20 -0700 Subject: [PATCH 224/234] add back start saga unwinding/idempotency tests --- nexus/src/app/sagas/instance_update/mod.rs | 56 +++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 18acbe9ba5a..72b354ab427 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1529,7 +1529,7 @@ mod test { .await; } - // === Active VMM destroyed tests ==== + // === Active VMM destroyed tests === #[nexus_test(server = crate::Server)] async fn test_active_vmm_destroyed_succeeds( @@ -1585,7 +1585,11 @@ mod test { let nexus = &cptestctx.server.server_context().nexus; let opctx = test_helpers::test_opctx(cptestctx); - test_helpers::action_failure_can_unwind::( + test_helpers::action_failure_can_unwind::< + SagaDoActualInstanceUpdate, + _, + _, + >( nexus, || { Box::pin(async { @@ -1607,6 +1611,54 @@ mod test { .await; } + // === idempotency and unwinding tests for the start saga === + + // 
We only do these tests with an "active VMM destroyed" precondition, since + // the behavior of the `start-instance-update` saga does *not* depend on the + // specific update to perform, and it seems unnecessary to run the start + // saga's tests against every possible migration outcome combination tested + // below. + + #[nexus_test(server = crate::Server)] + async fn test_start_saga_actions_succeed_idempotently( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let (state, params) = setup_active_vmm_destroyed_test(cptestctx).await; + let dag = create_saga_dag::(params).unwrap(); + + crate::app::sagas::test_helpers::actions_succeed_idempotently( + &cptestctx.server.server_context().nexus, + dag, + ) + .await; + + // Assert that the saga properly cleaned up the active VMM's resources. + verify_active_vmm_destroyed(cptestctx, state.instance().id()).await; + } + + #[nexus_test(server = crate::Server)] + async fn test_start_saga_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let _project_id = setup_test_project(&cptestctx.external_client).await; + let nexus = &cptestctx.server.server_context().nexus; + + test_helpers::action_failure_can_unwind::( + nexus, + || { + Box::pin(async { + let (_, params) = + setup_active_vmm_destroyed_test(cptestctx).await; + params + }) + }, + || Box::pin(after_unwinding(cptestctx)), + &cptestctx.logctx.log, + ) + .await; + } + // --- test helpers --- async fn setup_active_vmm_destroyed_test( From caa42632bd25c2c02bd00075c7476d58a68fa8b2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 13:12:32 -0700 Subject: [PATCH 225/234] release lock if child sagas unwind before locking This code is *really* gruesome, but it should, hopefully, make the unwinding behavior correct in the case where the child saga unwinds *from* the `become_updater` node without having successfully locked the instance record yet. Previously, this could be leaked since the child saga unwinding does not make the start saga unwind. Now, however, the start saga will unwind for any child saga error *except* for an "already locked" error, in which case we can just complete cleanly. I've also updated the unwinding tests to assert that the lock is held either by the (fake) start saga, OR unlocked, since unwinding from the first action can leave it locked by the parent. --- nexus/db-queries/src/db/datastore/instance.rs | 2 +- nexus/src/app/saga.rs | 6 - nexus/src/app/sagas/instance_update/mod.rs | 101 +++++++++++++--- nexus/src/app/sagas/instance_update/start.rs | 112 +++++++++++++++--- 4 files changed, 178 insertions(+), 43 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index a4e3bd19995..6a449e1406a 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -234,7 +234,7 @@ pub struct InstanceGestalt { /// when the lock is released. 
#[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct UpdaterLock { - updater_id: Uuid, + pub updater_id: Uuid, locked_gen: Generation, } diff --git a/nexus/src/app/saga.rs b/nexus/src/app/saga.rs index 2b510a0f12f..fcdbb0db597 100644 --- a/nexus/src/app/saga.rs +++ b/nexus/src/app/saga.rs @@ -371,12 +371,6 @@ pub(crate) struct StoppedSaga { impl StoppedSaga { /// Fetches the raw Steno result for the saga's execution - /// - /// This is a test-only routine meant for use in tests that need to examine - /// the details of a saga's final state (e.g., examining the exact point at - /// which it failed). Non-test callers should use `into_omicron_result` - /// instead. - #[cfg(test)] pub(crate) fn into_raw_result(self) -> SagaResult { self.result } diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 72b354ab427..56de697414e 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1358,6 +1358,8 @@ mod test { use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; use omicron_uuid_kinds::SledUuid; + use std::sync::Arc; + use std::sync::Mutex; use uuid::Uuid; type ControlPlaneTestContext = @@ -1407,8 +1409,8 @@ mod test { name: INSTANCE_NAME.parse().unwrap(), description: format!("instance {:?}", INSTANCE_NAME), }, - ncpus: InstanceCpuCount(2), - memory: ByteCount::from_gibibytes_u32(2), + ncpus: InstanceCpuCount(1), + memory: ByteCount::from_gibibytes_u32(1), hostname: INSTANCE_NAME.parse().unwrap(), user_data: b"#cloud-config".to_vec(), ssh_public_keys: Some(Vec::new()), @@ -1468,7 +1470,10 @@ mod test { } } - async fn after_unwinding(cptestctx: &ControlPlaneTestContext) { + async fn after_unwinding( + parent_saga_id: Option, + cptestctx: &ControlPlaneTestContext, + ) { let state = test_helpers::instance_fetch_by_name( cptestctx, INSTANCE_NAME, @@ -1480,11 +1485,24 @@ mod test { // Unlike most other sagas, we actually don't unwind the work performed // by an update saga, as we would prefer that at least some of it // succeeds. The only thing that *needs* to be rolled back when an - // instance-update saga fails is that the updater lock *MUST* be - // released so that a subsequent saga can run. See the section "on - // unwinding" in the documentation comment at the top of the - // instance-update module for details. - assert_instance_unlocked(instance); + // instance-update saga fails is that the updater lock *MUST* either + // remain locked by the parent start saga, or have been released so that + // a subsequent saga can run. See the section "on unwinding" in the + // documentation comment at the top of the instance-update module for + // details. + if let Some(parent_saga_id) = parent_saga_id { + if let Some(actual_lock_id) = instance.updater_id { + assert_eq!( + actual_lock_id, parent_saga_id, + "if the instance is locked after unwinding, it must be \ + locked by the `start-instance-update` saga, and not the \ + unwinding child saga!" + ); + } + } else { + assert_instance_unlocked(instance); + } + // Additionally, we assert that the instance record is in a // consistent state, ensuring that all changes to the instance record // are atomic. 
This is important *because* we won't roll back changes @@ -1584,6 +1602,10 @@ mod test { let _project_id = setup_test_project(&cptestctx.external_client).await; let nexus = &cptestctx.server.server_context().nexus; let opctx = test_helpers::test_opctx(cptestctx); + // Stupid side channel for passing the expected parent start saga's lock + // ID into the "after unwinding" method, so that it can check that the + // lock is either released or was never acquired. + let parent_saga_id = Arc::new(Mutex::new(None)); test_helpers::action_failure_can_unwind::< SagaDoActualInstanceUpdate, @@ -1592,7 +1614,9 @@ mod test { >( nexus, || { - Box::pin(async { + let parent_saga_id = parent_saga_id.clone(); + let opctx = &opctx; + Box::pin(async move { let (_, start_saga_params) = setup_active_vmm_destroyed_test(cptestctx).await; @@ -1602,10 +1626,25 @@ mod test { // params rather than the start saga's params. Otherwise, // we're just testing the unwinding behavior of the trivial // two-node start saga - make_real_params(cptestctx, &opctx, start_saga_params).await + let real_params = + make_real_params(cptestctx, opctx, start_saga_params) + .await; + *parent_saga_id.lock().unwrap() = + Some(real_params.orig_lock.updater_id); + real_params + }) + }, + || { + let parent_saga_id = parent_saga_id.clone(); + Box::pin(async move { + let parent_saga_id = + parent_saga_id.lock().unwrap().take().expect( + "parent saga's lock ID must have been set by the \ + `before_saga` function; this is a test bug", + ); + after_unwinding(Some(parent_saga_id), cptestctx).await }) }, - || Box::pin(after_unwinding(cptestctx)), &cptestctx.logctx.log, ) .await; @@ -1653,7 +1692,9 @@ mod test { params }) }, - || Box::pin(after_unwinding(cptestctx)), + // Don't pass a parent saga ID here because the saga MUST be + // unlocked if the whole start saga unwinds. + || Box::pin(after_unwinding(None, cptestctx)), &cptestctx.logctx.log, ) .await; @@ -2158,6 +2199,11 @@ mod test { setup_test_project(&cptestctx.external_client).await; let opctx = test_helpers::test_opctx(&cptestctx); + // Stupid side channel for passing the expected parent start saga's lock + // ID into the "after unwinding" method, so that it can check that the + // lock is either released or was never acquired. + let parent_saga_id = Arc::new(Mutex::new(None)); + test_helpers::action_failure_can_unwind::< SagaDoActualInstanceUpdate, _, @@ -2165,7 +2211,10 @@ mod test { >( nexus, || { - Box::pin(async { + let parent_saga_id = parent_saga_id.clone(); + let other_sleds = &other_sleds; + let opctx = &opctx; + Box::pin(async move { // Since the unwinding test will test unwinding from each // individual saga node *in the saga DAG constructed by the // provided params*, we need to give it the "real saga"'s @@ -2173,14 +2222,32 @@ mod test { // we're just testing the unwinding behavior of the trivial // two-node start saga. 
let start_saga_params = self - .setup_test(cptestctx, &other_sleds) + .setup_test(cptestctx, other_sleds) .await .start_saga_params(); - make_real_params(cptestctx, &opctx, start_saga_params) - .await + let real_params = make_real_params( + cptestctx, + opctx, + start_saga_params, + ) + .await; + *parent_saga_id.lock().unwrap() = + Some(real_params.orig_lock.updater_id); + real_params + }) + }, + || { + let parent_saga_id = parent_saga_id.clone(); + Box::pin(async move { + let parent_saga_id = + parent_saga_id.lock().unwrap().take().expect( + "parent saga's lock ID must have been set by \ + the `before_saga` function; this is a test \ + bug", + ); + after_unwinding(Some(parent_saga_id), cptestctx).await }) }, - || Box::pin(after_unwinding(cptestctx)), &cptestctx.logctx.log, ) .await; diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index ae7152d3ab0..1c4e3b00b85 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -9,11 +9,12 @@ use super::{ SagaDoActualInstanceUpdate, SagaInitError, UpdatesRequired, ACTION_GENERATE_ID, INSTANCE_LOCK, INSTANCE_LOCK_ID, }; +use crate::app::saga; use crate::app::sagas::declare_saga_actions; use nexus_db_queries::db::datastore::instance; use nexus_db_queries::{authn, authz}; use serde::{Deserialize, Serialize}; -use steno::{ActionError, DagBuilder, Node}; +use steno::{ActionError, DagBuilder, Node, SagaResultErr}; use uuid::Uuid; /// Parameters to the start instance update saga. @@ -164,6 +165,7 @@ async fn siu_fetch_state_and_start_real_saga( let opctx = crate::context::op_context_for_saga_action(&sagactx, &serialized_authn); let datastore = osagactx.datastore(); + let nexus = osagactx.nexus(); let state = datastore .instance_fetch_all(&opctx, &authz_instance) @@ -189,30 +191,102 @@ async fn siu_fetch_state_and_start_real_saga( "update.destroy_target_vmm" => ?update.destroy_target_vmm, "update.deprovision" => update.deprovision.is_some(), ); - if let Err(error) = osagactx - .nexus() - .sagas - .saga_execute::(RealParams { + // Prepare the child saga. + // + // /!\ WARNING /!\ This is really finicky: whether or not the start saga + // should unwind depends on *whether the child `instance-update` saga + // has advanced far enough to have inherited the lock or not. If the + // child has not inherited the lock, we *must* unwind to ensure the lock + // is dropped. + // + // Note that we *don't* use `SagaExecutor::saga_execute`, which prepares + // the child saga and waits for it to complete. That function wraps all + // the errors returned by this whole process in an external API error, + // which makes it difficult for us to figure out *why* the child saga + // failed, and whether we should unwind or not. + + let dag = + saga::create_saga_dag::(RealParams { serialized_authn, authz_instance, update, orig_lock, }) + // If we can't build a DAG for the child saga, we should unwind, so + // that we release the lock. + .map_err(|e| { + nexus.background_tasks.task_instance_updater.activate(); + ActionError::action_failed(e) + })?; + let child_result = nexus + .sagas + .saga_prepare(dag) .await - { - warn!( - log, - "instance update: real update saga failed (which *could* \ - mean nothing...)"; - "instance_id" => %instance_id, - "error" => %error, - ); - // If the real saga failed, kick the background task. 
If the real - // saga failed because this action was executed twice and the second - // child saga couldn't lock the instance, that's fine, because the - // background task will only start new sagas for instances whose DB - // state actually *needs* an update. - osagactx.nexus().background_tasks.task_instance_updater.activate(); + // Similarly, if we can't prepare the child saga, we need to unwind + // and release the lock. + .map_err(|e| { + nexus.background_tasks.task_instance_updater.activate(); + ActionError::action_failed(e) + })? + .start() + .await + // And, if we can't start it, we need to unwind. + .map_err(|e| { + nexus.background_tasks.task_instance_updater.activate(); + ActionError::action_failed(e) + })? + .wait_until_stopped() + .await + .into_raw_result(); + match child_result.kind { + Ok(_) => { + debug!( + log, + "instance update: child saga completed successfully"; + "instance_id" => %instance_id, + "child_saga_id" => %child_result.saga_id, + ) + } + // Check if the child saga failed to inherit the updater lock from + // this saga. + Err(SagaResultErr { + error_node_name, + error_source: ActionError::ActionFailed { source_error }, + .. + }) if error_node_name.as_ref() == super::INSTANCE_LOCK => { + if let Ok(instance::UpdaterLockError::AlreadyLocked) = + serde_json::from_value(source_error) + { + // If inheriting the lock failed because the lock was held by another + // saga. If this is the case, that's fine: this action must have + // executed more than once, and created multiple child sagas. No big deal. + return Ok(()); + } else { + // Otherwise, the child saga could not inherit the lock for + // some other reason. That means we MUST unwind to ensure + // the lock is released. + return Err(ActionError::action_failed( + "child saga failed to inherit lock".to_string(), + )); + } + } + Err(error) => { + warn!( + log, + "instance update: child saga failed, unwinding..."; + "instance_id" => %instance_id, + "child_saga_id" => %child_result.saga_id, + "error" => ?error, + ); + + // If the real saga failed, kick the background task. If the real + // saga failed because this action was executed twice and the second + // child saga couldn't lock the instance, that's fine, because the + // background task will only start new sagas for instances whose DB + // state actually *needs* an update. + nexus.background_tasks.task_instance_updater.activate(); + return Err(error.error_source); + } } } else { info!( From 59adbf4458ce0530b3523653ac7855845725229f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 14:29:04 -0700 Subject: [PATCH 226/234] turns out there's a normal reason that could happen --- nexus/src/app/sagas/instance_update/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 56de697414e..cb402010f95 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -622,9 +622,9 @@ impl UpdatesRequired { // If the *new* state no longer has a `propolis_id` field, that means // that the active VMM was destroyed without a successful migration out // (or, we migrated out to a target VMM that was immediately destroyed, - // which...seems weird but certainly could happen). In that case, the - // instance is no longer incarnated on a sled, and we must update the - // state of the world to reflect that. + // which could happen if a running VM shut down immediately after + // migrating). 
In that case, the instance is no longer incarnated on a + // sled, and we must update the state of the world to reflect that. let deprovision = if new_runtime.propolis_id.is_none() { // N.B. that this does *not* set `update_required`, because // `new_runtime.propolis_id` might be `None` just because there was, From 94611681ce7a48eb27caab347b69b52ca9c24ab8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 14:30:21 -0700 Subject: [PATCH 227/234] rename `UpdatesRequired::for_snapshot` Since we're not calling 'em "snapshots" anymore, this should be clearer. --- nexus/src/app/sagas/instance_update/mod.rs | 6 +++--- nexus/src/app/sagas/instance_update/start.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index cb402010f95..6bc902ad180 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -464,7 +464,7 @@ struct Deprovision { } impl UpdatesRequired { - fn for_snapshot( + fn for_instance( log: &slog::Logger, snapshot: &InstanceGestalt, ) -> Option { @@ -1168,7 +1168,7 @@ async fn chain_update_saga( .await .context("failed to fetch latest snapshot for instance")?; - if let Some(update) = UpdatesRequired::for_snapshot(log, &new_state) { + if let Some(update) = UpdatesRequired::for_instance(log, &new_state) { debug!( log, "instance update: additional updates required, preparing a \ @@ -2705,7 +2705,7 @@ mod test { .instance_fetch_all(&opctx, &authz_instance) .await .expect("instance must exist"); - let update = UpdatesRequired::for_snapshot(&log, &state) + let update = UpdatesRequired::for_instance(&log, &state) .expect("the test's precondition should require updates"); info!( diff --git a/nexus/src/app/sagas/instance_update/start.rs b/nexus/src/app/sagas/instance_update/start.rs index 1c4e3b00b85..fbd8cbffc23 100644 --- a/nexus/src/app/sagas/instance_update/start.rs +++ b/nexus/src/app/sagas/instance_update/start.rs @@ -176,7 +176,7 @@ async fn siu_fetch_state_and_start_real_saga( // state snapshot. If there are updates to perform, execute the "real" // update saga. Otherwise, if we don't need to do anything else, simply // release the lock and finish this saga. - if let Some(update) = UpdatesRequired::for_snapshot(log, &state) { + if let Some(update) = UpdatesRequired::for_instance(log, &state) { info!( log, "instance update: starting real update saga..."; From 22533902ada590fe36bb88257a3b99bbd5397765 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 6 Aug 2024 14:45:59 -0700 Subject: [PATCH 228/234] 'rename symbol' doesnt work on docs --- nexus/src/app/sagas/instance_update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 6bc902ad180..1b7c88b750e 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -418,7 +418,7 @@ pub fn update_saga_needed( /// Depending on the current state of the instance and its VMM(s) and migration, /// an update saga may perform a variety of operations. Which operations need to /// be performed for the current state snapshot of the instance, VMM, and -/// migration records is determined by the [`UpdatesRequired::for_snapshot`] +/// migration records is determined by the [`UpdatesRequired::for_instance`] /// function. 
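// As a rough usage sketch (not part of the patch itself; pieced together from
// the call sites in this series, e.g. `siu_fetch_state_and_start_real_saga` in
// `start.rs`): a caller fetches the instance's current state and then asks
// `for_instance` whether any work is actually required:
//
//     let state = datastore.instance_fetch_all(&opctx, &authz_instance).await?;
//     match UpdatesRequired::for_instance(log, &state) {
//         Some(update) => { /* chain into the "real" update saga */ }
//         None => { /* nothing to do; just release the updater lock */ }
//     }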
#[derive(Debug, Deserialize, Serialize)] struct UpdatesRequired { From 47d8c3839d8449463b56689b55060e249ddaa37b Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 7 Aug 2024 10:48:36 -0700 Subject: [PATCH 229/234] improve errors/log messages --- nexus/db-queries/src/db/datastore/instance.rs | 314 +++++++++++++----- nexus/src/app/sagas/instance_update/mod.rs | 7 +- 2 files changed, 234 insertions(+), 87 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 6a449e1406a..0145c0ef85c 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -936,22 +936,27 @@ impl DataStore { } /// Attempts to lock an instance's record to apply state updates in an - /// instance-update saga, returning the state of the instance when the lock - /// was acquired. + /// instance-update saga, returning an [`UpdaterLock`] if the lock is + /// successfully acquired. /// /// # Notes /// /// This method MUST only be called from the context of a saga! The /// calling saga must ensure that the reverse action for the action that /// acquires the lock must call [`DataStore::instance_updater_unlock`] to - /// ensure that the lock is always released if the saga unwinds. + /// ensure that the lock is always released if the saga unwinds. If the saga + /// locking the instance completes successfully, it must release the lock + /// using [`DataStore::instance_updater_unlock`], or use + /// [`DataStore::instance_commit_update`] to release the lock and write back + /// a new [`InstanceRuntimeState`] in a single atomic query. /// /// This method is idempotent: if the instance is already locked by the same /// saga, it will succeed, as though the lock was acquired. /// /// # Arguments /// - /// - `authz_instance`: the instance to attempt to lock to lock + /// - `opctx`: the [`OpContext`] for this operation. + /// - `authz_instance`: the instance to attempt to lock. /// - `updater_id`: the UUID of the saga that's attempting to lock this /// instance. /// @@ -1076,15 +1081,59 @@ impl DataStore { } } + /// Attempts to "inherit" the lock acquired by + /// [`DataStore::instance_updater_lock`] by setting a new `child_lock_id` as + /// the current updater, if (and only if) the lock is held by the provided + /// `parent_lock`. + /// + /// This essentially performs the equivalent of a [compare-exchange] + /// operation on the instance record's lock ID field, which succeeds if the + /// current lock ID matches the parent. Using this method ensures that, if a + /// parent saga starts multiple child sagas, only one of them can + /// successfully acquire the lock. + /// + /// # Notes + /// + /// This method MUST only be called from the context of a saga! The + /// calling saga must ensure that the reverse action for the action that + /// acquires the lock must call [`DataStore::instance_updater_unlock`] to + /// ensure that the lock is always released if the saga unwinds. If the saga + /// locking the instance completes successfully, it must release the lock + /// using [`DataStore::instance_updater_unlock`], or use + /// [`DataStore::instance_commit_update`] to release the lock and write back + /// a new [`InstanceRuntimeState`] in a single atomic query. + + /// + /// This method is idempotent: if the instance is already locked by the same + /// saga, it will succeed, as though the lock was acquired. + /// + /// # Arguments + /// + /// - `opctx`: the [`OpContext`] for this operation. 
+ /// - `authz_instance`: the instance to attempt to inherit the lock on. + /// - `parent_lock`: the [`UpdaterLock`] to attempt to inherit the lock + /// from. If the current updater UUID and generation matches this, the + /// lock can be inherited by `child_id`. + /// - `child_lock_id`: the UUID of the saga that's attempting to lock this + /// instance. + /// + /// # Returns + /// + /// - [`Ok`]`(`[`UpdaterLock`]`)` if the lock was successfully inherited. + /// - [`Err`]`([`UpdaterLockError::AlreadyLocked`])` if the instance was + /// locked by a different saga, other than the provided `parent_lock`. + /// - [`Err`]`([`UpdaterLockError::Query`]`(...))` if the query to fetch + /// the instance or lock it returned another error (such as if the + /// instance no longer exists, or if the database connection failed). pub async fn instance_updater_inherit_lock( &self, opctx: &OpContext, authz_instance: &authz::Instance, - UpdaterLock { updater_id: parent_id, locked_gen }: UpdaterLock, + parent_lock: UpdaterLock, child_lock_id: Uuid, ) -> Result { use db::schema::instance::dsl; - + let UpdaterLock { updater_id: parent_id, locked_gen } = parent_lock; let instance_id = authz_instance.id(); let new_gen = Generation(locked_gen.0.next()); @@ -1111,10 +1160,11 @@ impl DataStore { })?; match result { - // If we updated the record, the lock has been released! Return - // `Ok(true)` to indicate that we released the lock successfully. + // If we updated the record, the lock has been successfully + // inherited! Return `Ok(true)` to indicate that we have acquired + // the lock successfully. UpdateAndQueryResult { status: UpdateStatus::Updated, .. } => { - slog::info!( + slog::debug!( &opctx.log, "inherited lock from {parent_id} to {child_lock_id}"; "instance_id" => %instance_id, @@ -1135,16 +1185,38 @@ impl DataStore { status: UpdateStatus::NotUpdatedButExists, ref found, } if found.updater_id == Some(child_lock_id) => { - debug_assert_eq!(found.updater_gen, new_gen,); + slog::debug!( + &opctx.log, + "previously inherited lock from {parent_id} to \ + {child_lock_id}"; + "instance_id" => %instance_id, + "updater_id" => %child_lock_id, + "locked_gen" => ?found.updater_gen, + "parent_id" => %parent_id, + "parent_gen" => ?locked_gen, + ); + debug_assert_eq!(found.updater_gen, new_gen); Ok(UpdaterLock { updater_id: child_lock_id, locked_gen: new_gen, }) } - // The instance exists, but the lock ID doesn't match our lock ID. - // This means we were trying to release a lock we never held, whcih - // is almost certainly a programmer error. - UpdateAndQueryResult { .. } => Err(UpdaterLockError::AlreadyLocked), + // The instance exists, but it's locked by a different saga than the + // parent we were trying to inherit the lock from. We cannot acquire + // the lock at this time. + UpdateAndQueryResult { ref found, .. } => { + slog::debug!( + &opctx.log, + "cannot inherit instance-updater lock from {parent_id} to \ + {child_lock_id}: this instance is not locked by the \ + expected parent saga"; + "instance_id" => %instance_id, + "updater_id" => %child_lock_id, + "parent_id" => %parent_id, + "actual_lock_id" => ?found.updater_id, + ); + Err(UpdaterLockError::AlreadyLocked) + } } } @@ -1193,40 +1265,97 @@ impl DataStore { // If we updated the record, the lock has been released! Return // `Ok(true)` to indicate that we released the lock successfully. UpdateAndQueryResult { status: UpdateStatus::Updated, .. 
} => { - Ok(true) + return Ok(true); } - // The generation has advanced past the generation at which the - // lock was held. This means that we have already released the - // lock. Return `Ok(false)` here for idempotency. - UpdateAndQueryResult { - status: UpdateStatus::NotUpdatedButExists, - ref found, - } if found.updater_gen > locked_gen => Ok(false), - // The instance exists, but the lock ID doesn't match our lock ID. - // This means we were trying to release a lock we never held, whcih - // is almost certainly a programmer error. - UpdateAndQueryResult { ref found, .. } => { - match found.updater_id { - Some(actual_id) if actual_id != updater_id => { - slog::error!( - &opctx.log, - "attempted to release a lock held by another saga"; - "instance_id" => %instance_id, - "updater_id" => %updater_id, - "actual_id" => %actual_id, - "found_gen" => ?found.updater_gen, - "locked_gen" => ?locked_gen, - ); - Err(Error::internal_error( - "attempted to release a lock held by another saga! this is a bug!", - )) - }, - Some(_) => Ok(false), - None => Err(Error::internal_error( - "attempted to release a lock on an instance that is not locked! this is a bug!", - )), + // The instance exists, but we didn't unlock it. In almost all + // cases, that's actually *fine*, since this suggests we didn't + // actually have the lock to release, so we don't need to worry + // about unlocking the instance. However, depending on the + // particular reason we didn't actually unlock the instance, this + // may be more or less likely to indicate a bug. Remember that saga + // actions --- even unwind actions --- must be idempotent, so we + // *may* just be trying to unlock an instance we already + // successfully unlocked, which is fine. + UpdateAndQueryResult { ref found, .. } + if found.time_deleted().is_some() => + { + debug!( + &opctx.log, + "attempted to unlock an instance that has been deleted"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "time_deleted" => ?found.time_deleted(), + ); + return Ok(false); + } + + // If the instance is no longer locked by this saga, that's probably fine. + // We don't need to unlock it. + UpdateAndQueryResult { ref found, .. } + if found.updater_id != Some(updater_id) => + { + if found.updater_gen > locked_gen { + // The generation has advanced past the generation where we + // acquired the lock. That's totally fine: a previous + // execution of the same saga action must have unlocked it, + // and now it is either unlocked, or locked by a different + // saga. + debug!( + &opctx.log, + "attempted to unlock an instance that is no longer \ + locked by this saga"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => ?found.updater_id.as_ref(), + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + } else { + // On the other hand, if the generation is less than or + // equal to the generation at which we locked the instance, + // that eems kinda suspicious --- perhaps we believed we + // held the lock, but didn't actually, which could be + // programmer error. + // + // However, this *could* conceivably happen: the same saga + // node could have executed previously and released the + // lock, and then the generation counter advanced enough + // times to wrap around, and then the same action tried to + // release its lock again. 64-bit generation counters + // overflowing in an instance's lifetime seems unlikely, but + // nothing is impossible... 
+ warn!( + &opctx.log, + "attempted to release a lock held by another saga \ + at the same generation! this seems suspicious..."; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => ?found.updater_id.as_ref(), + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); } + + Ok(false) + } + + // If we *are* still holding the lock, we must be trying to + // release it at the wrong generation. That seems quite + // suspicious. + UpdateAndQueryResult { ref found, .. } => { + warn!( + &opctx.log, + "attempted to release a lock at the wrong generation"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error( + "instance is locked by this saga, but at a different \ + generation", + )) } } } @@ -1297,56 +1426,73 @@ impl DataStore { // The instance has been marked as deleted, so no updates were // committed! - UpdateAndQueryResult { - status: UpdateStatus::NotUpdatedButExists, - ref found, - } if found.time_deleted().is_some() => { + UpdateAndQueryResult { ref found, .. } + if found.time_deleted().is_some() => + { warn!( &opctx.log, - "cannot commit instance update, as the instance no longer exists"; + "cannot commit instance update, as the instance no longer \ + exists"; "instance_id" => %instance_id, "updater_id" => %updater_id, "time_deleted" => ?found.time_deleted() ); - Err(LookupType::ById(instance_id).into_not_found(ResourceType::Instance)) + Err(LookupType::ById(instance_id) + .into_not_found(ResourceType::Instance)) } - // The generation has advanced past the generation at which the - // lock was held. This means that we have already released the - // lock. Return `Ok(false)` here for idempotency. - UpdateAndQueryResult { - status: UpdateStatus::NotUpdatedButExists, - ref found, - } if found.updater_gen > locked_gen => Ok(false), + // The instance exists, but we cannot update it because the state + // generation has advanced past ours. That's fine --- assume we + // already updated the instance. + UpdateAndQueryResult { ref found, .. } + if found.runtime().r#gen > new_runtime.r#gen => + { + debug!( + &opctx.log, + "cannot commit instance updates, as the state generation \ + has advanced: they've probably already been committed."; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "expected_gen" => ?new_runtime.r#gen, + "actual_gen" => ?found.runtime().r#gen, + ); + Ok(false) + } - // The instance exists, but the lock ID doesn't match our lock ID. - // This means we were trying to release a lock we never held, whcih - // is almost certainly a programmer error. - UpdateAndQueryResult { ref found, .. } => { - match found.updater_id { - Some(actual_id) if actual_id != updater_id => { - slog::error!( - &opctx.log, - "attempted to release a lock held by another saga"; - "instance_id" => %instance_id, - "updater_id" => %updater_id, - "actual_id" => %actual_id, - "found_gen" => ?found.updater_gen, - "locked_gen" => ?locked_gen, - ); - Err(Error::internal_error( - "attempted to release a lock held by another saga! this is a bug!", - )) - }, - Some(_) => Err(Error::conflict( - "attempted to commit an instance update, but the state generation has advanced!" - )), - None => Err(Error::internal_error( - "attempted to release a lock on an instance that is not locked! this is a bug!", - )), + // The instance exists, but we could not update it because the lock + // did not match. + UpdateAndQueryResult { ref found, .. 
} => match found.updater_id { + Some(actual_id) => { + const MSG: &'static str = + "cannot commit instance updates: the instance is \ + locked by another saga!"; + error!( + &opctx.log, + "{MSG}"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "actual_id" => %actual_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error(MSG)) } - } + None => { + const MSG: &'static str = + "cannot commit instance updates: the instance is \ + not locked"; + error!( + &opctx.log, + "{MSG}"; + "instance_id" => %instance_id, + "updater_id" => %updater_id, + "found_gen" => ?found.updater_gen, + "locked_gen" => ?locked_gen, + ); + Err(Error::internal_error(MSG)) + } + }, } } } diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 1b7c88b750e..06e99184579 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -1302,7 +1302,7 @@ async fn unwind_instance_lock( } else if total_duration > WARN_DURATION { warn!( log, - "instance update: server error while unlocking instance, + "instance update: server error while unlocking instance, \ retrying"; "instance_id" => %instance_id, "lock" => ?lock, @@ -1313,7 +1313,8 @@ async fn unwind_instance_lock( } else { info!( log, - "server error while recording saga event, retrying"; + "instance update: server error while unlocking instance, \ + retrying"; "instance_id" => %instance_id, "lock" => ?lock, "error" => &error, @@ -1692,7 +1693,7 @@ mod test { params }) }, - // Don't pass a parent saga ID here because the saga MUST be + // Don't pass a parent saga ID here because the instance MUST be // unlocked if the whole start saga unwinds. || Box::pin(after_unwinding(None, cptestctx)), &cptestctx.logctx.log, From 58bb35356c0d332de68581d3d10e5ddf74843ea0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 7 Aug 2024 11:24:16 -0700 Subject: [PATCH 230/234] update instance lock tests to match new behavior --- nexus/db-queries/src/db/datastore/instance.rs | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 0145c0ef85c..6492f98b8b5 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1706,10 +1706,10 @@ mod tests { } #[tokio::test] - async fn test_instance_updater_unlocking_someone_elses_instance_errors() { + async fn test_instance_updater_cant_unlock_someone_elses_instance_() { // Setup let logctx = dev::test_setup_log( - "test_instance_updater_unlocking_someone_elses_instance_errors", + "test_instance_updater_cant_unlock_someone_elses_instance_", ); let mut db = test_setup_database(&logctx.log).await; let (opctx, datastore) = datastore_test(&logctx, &db).await; @@ -1725,8 +1725,8 @@ mod tests { ) .expect("instance should be locked"); - // attempting to unlock with a different saga ID should be an error. - let err = dbg!( + // attempting to unlock with a different saga ID shouldn't do anything. + let unlocked = dbg!( datastore .instance_updater_unlock( &opctx, @@ -1743,16 +1743,15 @@ mod tests { ) .await ) - .expect_err( - "unlocking the instance with someone else's ID should fail", - ); - assert_eq!( - err, - Error::internal_error( - "attempted to release a lock held by another saga! 
\ - this is a bug!", - ), - ); + .unwrap(); + assert!(!unlocked); + + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.updater_id, Some(saga1)); + assert_eq!(instance.updater_gen, lock1.locked_gen); + let next_gen = Generation(lock1.locked_gen.0.next()); // unlocking with the correct ID should succeed. @@ -1764,9 +1763,15 @@ mod tests { .expect("instance should unlock"); assert!(unlocked, "instance should have unlocked"); + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.updater_id, None); + assert_eq!(instance.updater_gen, next_gen); + // unlocking with the lock holder's ID *again* at a new generation - // (where the lock is no longer held) should fail. - let err = dbg!( + // (where the lock is no longer held) shouldn't do anything + let unlocked = dbg!( datastore .instance_updater_unlock( &opctx, @@ -1778,16 +1783,8 @@ mod tests { ) .await ) - .expect_err( - "unlocking the instance with someone else's ID should fail", - ); - assert_eq!( - err, - Error::internal_error( - "attempted to release a lock on an instance \ - that is not locked! this is a bug!" - ), - ); + .unwrap(); + assert!(!unlocked); // Clean up. db.cleanup().await.unwrap(); From c707814ff3342e7bbeabcf18e0b743c0430264dd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 7 Aug 2024 11:51:08 -0700 Subject: [PATCH 231/234] fix `instance_commit_update` idempotency --- nexus/db-queries/src/db/datastore/instance.rs | 92 ++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 6492f98b8b5..62b3ca8019a 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1446,7 +1446,7 @@ impl DataStore { // generation has advanced past ours. That's fine --- assume we // already updated the instance. UpdateAndQueryResult { ref found, .. } - if found.runtime().r#gen > new_runtime.r#gen => + if found.runtime().r#gen >= new_runtime.r#gen => { debug!( &opctx.log, @@ -1842,6 +1842,96 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_instance_commit_update_is_idempotent() { + // Setup + let logctx = + dev::test_setup_log("test_instance_commit_update_is_idempotent"); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + let saga1 = Uuid::new_v4(); + + // lock the instance once. + let lock = dbg!( + datastore + .instance_updater_lock(&opctx, &authz_instance, saga1) + .await + ) + .expect("instance should be locked"); + let new_runtime = &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: Some(Uuid::new_v4()), + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::Vmm, + }; + + let updated = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &new_runtime + ) + .await + ) + .expect("instance_commit_update should succeed"); + assert!(updated, "it should be updated"); + + // okay, let's do it again at the same generation. 
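        // Because `instance_commit_update` is conditioned on both the updater
        // lock and the instance's state generation, this second call with the
        // same lock and runtime state should be a no-op: it is expected to
        // return `Ok(false)` and leave the previously-committed state in
        // place, as the assertions below verify.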
+ let updated = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &new_runtime + ) + .await + ) + .expect("instance_commit_update should succeed"); + assert!(!updated, "it was already updated"); + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.runtime().propolis_id, new_runtime.propolis_id); + assert_eq!(instance.runtime().r#gen, new_runtime.r#gen); + + // Doing it again at the same generation with a *different* state + // shouldn't change the instance at all. + let updated = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &InstanceRuntimeState { + propolis_id: Some(Uuid::new_v4()), + migration_id: Some(Uuid::new_v4()), + dst_propolis_id: Some(Uuid::new_v4()), + ..new_runtime.clone() + } + ) + .await + ) + .expect("instance_commit_update should succeed"); + assert!(!updated, "it was already updated"); + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.runtime().propolis_id, new_runtime.propolis_id); + assert_eq!(instance.runtime().dst_propolis_id, None); + assert_eq!(instance.runtime().migration_id, None); + assert_eq!(instance.runtime().r#gen, new_runtime.r#gen); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_instance_fetch_all() { // Setup From 8ed6dc6b6d07beec28161f0071521f9ea1f9da70 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 7 Aug 2024 16:11:12 -0700 Subject: [PATCH 232/234] fail to commit update if gen has advanced while locked see https://github.com/oxidecomputer/omicron/pull/5749#discussion_r1707889846 --- nexus/db-queries/src/db/datastore/instance.rs | 126 ++++++++++++++++-- 1 file changed, 118 insertions(+), 8 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 62b3ca8019a..455aa62192b 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -1398,7 +1398,7 @@ impl DataStore { // - the provided updater generation matches the current updater // generation. .filter(dsl::updater_gen.eq(locked_gen)) - .filter(dsl::state_generation.lt(new_runtime.gen)) + .filter(dsl::state_generation.lt(new_runtime.r#gen)) .set(( dsl::updater_gen.eq(Generation(locked_gen.0.next())), dsl::updater_id.eq(None::), @@ -1417,6 +1417,9 @@ impl DataStore { ) })?; + // The expected state generation number of the instance record *before* + // applying the update. + let prev_state_gen = u64::from(new_runtime.r#gen.0).saturating_sub(1); match result { // If we updated the record, the lock has been released! Return // `Ok(true)` to indicate that we released the lock successfully. @@ -1442,24 +1445,50 @@ impl DataStore { .into_not_found(ResourceType::Instance)) } - // The instance exists, but we cannot update it because the state - // generation has advanced past ours. That's fine --- assume we - // already updated the instance. + // The instance exists, but both the lock generation *and* the state + // generation no longer matches ours. That's fine --- presumably, + // another execution of the same saga action has already updated the + // instance record. UpdateAndQueryResult { ref found, .. 
} - if found.runtime().r#gen >= new_runtime.r#gen => + if u64::from(found.runtime().r#gen.0) != prev_state_gen + && found.updater_gen != locked_gen => { + debug_assert_ne!(found.updater_id, Some(updater_id)); debug!( &opctx.log, "cannot commit instance updates, as the state generation \ - has advanced: they've probably already been committed."; + and lock generation have advanced: the required updates \ + have probably already been committed."; "instance_id" => %instance_id, + "expected_state_gen" => ?new_runtime.r#gen, + "actual_state_gen" => ?found.runtime().r#gen, "updater_id" => %updater_id, - "expected_gen" => ?new_runtime.r#gen, - "actual_gen" => ?found.runtime().r#gen, + "updater_gen" => ?locked_gen, + "actual_updater_gen" => ?found.updater_gen, ); Ok(false) } + // The state generation has advanced, but the instance is *still* + // locked by this saga. That's bad --- this update saga may no + // longer update the instance, as its state has changed, potentially + // invalidating the updates. We need to unwind. + UpdateAndQueryResult { ref found, .. } + if u64::from(found.runtime().r#gen.0) != prev_state_gen + && found.updater_gen == locked_gen + && found.updater_id == Some(updater_id) => + { + info!( + &opctx.log, + "cannot commit instance update, as the state generation \ + has advanced, potentially invalidating the update"; + "instance_id" => %instance_id, + "expected_state_gen" => ?new_runtime.r#gen, + "actual_state_gen" => ?found.runtime().r#gen, + ); + Err(Error::conflict("instance state has changed")) + } + // The instance exists, but we could not update it because the lock // did not match. UpdateAndQueryResult { ref found, .. } => match found.updater_id { @@ -1932,6 +1961,87 @@ mod tests { logctx.cleanup_successful(); } + #[tokio::test] + async fn test_instance_update_invalidated_while_locked() { + // Setup + let logctx = dev::test_setup_log( + "test_instance_update_invalidated_while_locked", + ); + let mut db = test_setup_database(&logctx.log).await; + let (opctx, datastore) = datastore_test(&logctx, &db).await; + let authz_instance = create_test_instance(&datastore, &opctx).await; + let saga1 = Uuid::new_v4(); + + // Lock the instance + let lock = dbg!( + datastore + .instance_updater_lock(&opctx, &authz_instance, saga1) + .await + ) + .expect("instance should be locked"); + + // Mutate the instance state, invalidating the state when the lock was + // acquired. + let new_runtime = &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: Some(Uuid::new_v4()), + dst_propolis_id: Some(Uuid::new_v4()), + migration_id: Some(Uuid::new_v4()), + nexus_state: InstanceState::Vmm, + }; + let updated = dbg!( + datastore + .instance_update_runtime( + &InstanceUuid::from_untyped_uuid(authz_instance.id()), + &new_runtime + ) + .await + ) + .expect("instance_update_runtime should succeed"); + assert!(updated, "it should be updated"); + + // Okay, now try to commit the result of an update saga. This must fail, + // because the state generation has changed while we had locked the + // instance. 
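        // (This exercises the `Error::conflict("instance state has changed")`
        // case added above: in a real update saga, that failure makes the saga
        // unwind, release the updater lock, and let the instance-updater
        // background task schedule a fresh update based on the new state.)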
+ let _err = dbg!( + datastore + .instance_commit_update( + &opctx, + &authz_instance, + &lock, + &InstanceRuntimeState { + time_updated: Utc::now(), + r#gen: Generation(external::Generation::from_u32(2)), + propolis_id: None, + dst_propolis_id: None, + migration_id: None, + nexus_state: InstanceState::NoVmm, + }, + ) + .await + ) + .expect_err( + "instance_commit_update should fail if the state generation is \ + stale", + ); + + let instance = + dbg!(datastore.instance_refetch(&opctx, &authz_instance).await) + .expect("instance should exist"); + assert_eq!(instance.runtime().propolis_id, new_runtime.propolis_id); + assert_eq!( + instance.runtime().dst_propolis_id, + new_runtime.dst_propolis_id + ); + assert_eq!(instance.runtime().migration_id, new_runtime.migration_id); + assert_eq!(instance.runtime().nexus_state, new_runtime.nexus_state); + + // Clean up. + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + #[tokio::test] async fn test_instance_fetch_all() { // Setup From d93a80bc7bcf90fd263f4ac39ed8e3463e5d4336 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 9 Aug 2024 09:49:09 -0700 Subject: [PATCH 233/234] use `InstanceAndVmm::effective_state` in start This makes @gjcolombo's change from #6278 a bit more consistent with the rest of the code, and in particular, ensures that we always present the user with the same instance state names. It also addresses the `SagaUnwound` behavior Greg pointed out in [this comment][1]. [1]: https://github.com/oxidecomputer/omicron/pull/5749#discussion_r1710677685 --- nexus/src/app/instance.rs | 138 +++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 75 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index d5f869a4cfc..344d2688f7c 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -19,7 +19,6 @@ use crate::external_api::params; use cancel_safe_futures::prelude::*; use futures::future::Fuse; use futures::{FutureExt, SinkExt, StreamExt}; -use nexus_db_model::InstanceState as DbInstanceState; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_model::Vmm as DbVmm; @@ -1899,83 +1898,69 @@ fn instance_start_allowed( // // If the instance doesn't have an active VMM, see if the instance state // permits it to start. - if let Some(vmm) = vmm { - match vmm.runtime.state { - // If the VMM is already starting or is in another "active" - // state, succeed to make successful start attempts idempotent. - DbVmmState::Starting - | DbVmmState::Running - | DbVmmState::Rebooting - | DbVmmState::Migrating => { - debug!(log, "asked to start an active instance"; - "instance_id" => %instance.id()); - - Ok(InstanceStartDisposition::AlreadyStarted) - } - // If a previous start saga failed and left behind a VMM in the - // SagaUnwound state, allow a new start saga to try to overwrite - // it. - DbVmmState::SagaUnwound => { - debug!( - log, - "instance's last VMM's start saga unwound, OK to start"; - "instance_id" => %instance.id() - ); + match state.effective_state() { + // If the VMM is already starting or is in another "active" + // state, succeed to make successful start attempts idempotent. 
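        // (`state.effective_state()` folds the instance record and its active
        // VMM, if any, into the single externally-visible `InstanceState`, so
        // these arms match on the same state names that are shown to users.)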
+ s @ InstanceState::Starting + | s @ InstanceState::Running + | s @ InstanceState::Rebooting + | s @ InstanceState::Migrating => { + debug!(log, "asked to start an active instance"; + "instance_id" => %instance.id(), + "state" => ?s); + + Ok(InstanceStartDisposition::AlreadyStarted) + } + InstanceState::Stopped => { + match vmm.as_ref() { + // If a previous start saga failed and left behind a VMM in the + // SagaUnwound state, allow a new start saga to try to overwrite + // it. + Some(vmm) if vmm.runtime.state == DbVmmState::SagaUnwound => { + debug!( + log, + "instance's last VMM's start saga unwound, OK to start"; + "instance_id" => %instance.id() + ); - Ok(InstanceStartDisposition::Start) - } - // When sled agent publishes a Stopped state, Nexus should clean - // up the instance/VMM pointer. - DbVmmState::Stopped => { - let propolis_id = instance - .runtime() - .propolis_id - .expect("needed a VMM ID to fetch a VMM record"); - error!(log, - "instance is stopped but still has an active VMM"; - "instance_id" => %instance.id(), - "propolis_id" => %propolis_id); - - Err(Error::internal_error( - "instance is stopped but still has an active VMM", - )) + Ok(InstanceStartDisposition::Start) + } + // This shouldn't happen: `InstanceAndVmm::effective_state` should + // only return `Stopped` if there is no active VMM or if the VMM is + // `SagaUnwound`. + Some(vmm) => { + error!(log, + "instance is stopped but still has an active VMM"; + "instance_id" => %instance.id(), + "propolis_id" => %vmm.id, + "propolis_state" => ?vmm.runtime.state); + + Err(Error::internal_error( + "instance is stopped but still has an active VMM", + )) + } + // Ah, it's actually stopped. We can restart it. + None => Ok(InstanceStartDisposition::Start), } - _ => Err(Error::conflict(&format!( - "instance is in state {} but must be {} to be started", - vmm.runtime.state, - InstanceState::Stopped - ))), } - } else { - match instance.runtime_state.nexus_state { - // If the instance is in a known-good no-VMM state, it can - // start. - DbInstanceState::NoVmm => { - debug!(log, "instance has no VMM, OK to start"; - "instance_id" => %instance.id()); - - Ok(InstanceStartDisposition::Start) - } - // If the instance isn't ready yet or has been destroyed, it - // can't start. - // - // TODO(#2825): If the "Failed" state could be interpreted to - // mean "stopped abnormally" and not just "Nexus doesn't know - // what state the instance is in," it would be fine to start the - // instance here. See RFD 486. - DbInstanceState::Creating - | DbInstanceState::Failed - | DbInstanceState::Destroyed => Err(Error::conflict(&format!( - "instance is in state {} but must be {} to be started", - instance.runtime_state.nexus_state, + InstanceState::Stopping => { + let (propolis_id, propolis_state) = match vmm.as_ref() { + Some(vmm) => (Some(vmm.id), Some(vmm.runtime.state)), + None => (None, None), + }; + debug!(log, "instance's VMM is still in the process of stopping"; + "instance_id" => %instance.id(), + "propolis_id" => ?propolis_id, + "propolis_state" => ?propolis_state); + Err(Error::conflict( + "instance must finish stopping before it can be started", + )) + } + s => { + return Err(Error::conflict(&format!( + "instance is in state {s} but it must be {} to be started", InstanceState::Stopped - ))), - // If the instance is in the Vmm state, there should have been - // an active Propolis ID and a VMM record to read, so this - // branch shouldn't have been reached. 
- DbInstanceState::Vmm => Err(Error::internal_error( - "instance is in state Vmm but has no active VMM", - )), + ))) } } } @@ -1986,7 +1971,10 @@ mod tests { use super::*; use core::time::Duration; use futures::{SinkExt, StreamExt}; - use nexus_db_model::{Instance as DbInstance, VmmInitialState}; + use nexus_db_model::{ + Instance as DbInstance, InstanceState as DbInstanceState, + VmmInitialState, VmmState as DbVmmState, + }; use omicron_common::api::external::{ Hostname, IdentityMetadataCreateParams, InstanceCpuCount, Name, }; From 67c424c182d2d18f832f7d4886e6d54c64cc09fe Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 9 Aug 2024 13:19:22 -0700 Subject: [PATCH 234/234] document more saga interactions --- nexus/src/app/sagas/instance_update/mod.rs | 52 +++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index 06e99184579..71abe63bbd1 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -110,7 +110,8 @@ //! - Only the `instance_start` saga can set the instance's *active* Propolis //! ID, and it can only do this if there is currently no active Propolis. //! - Only the `instance_migrate` saga can set the instance's *target* Propolis -//! ID and migration ID, and it can only do that if these fields are unset. +//! ID and migration ID, and it can only do that if these fields are unset, or +//! were left behind by a failed `instance_migrate` saga unwinding. //! - Only the `instance_update` saga can unset a migration ID and target //! Propolis ID, which it will do when handling an update from sled-agent that //! indicates that a migration has succeeded or failed. @@ -176,6 +177,45 @@ //! saga unwinds into the start saga, that's fine, because a double-unlock is //! prevented by the saga ID having changed in the "inherit lock" operation. //! +//! ### Interaction With Other Sagas +//! +//! The instance-updater lock only provides mutual exclusion with regards to +//! *other `instance-update` sagas*. It does *not* prevent modifications to the +//! instance record by other sagas, such as `instance-start`, +//! `instance-migrate`, and `instance-delete`. Instead, mutual exclusion between +//! the `instance-update` saga and `instance-start` and `instance-delete` sagas +//! is ensured by the actual state of the instance record, as discussed above: +//! start and delete sagas can be started only when the instance has no active +//! VMM, and the `instance-update` saga will only run when an instance *does* +//! have an active VMM that has transitioned to a state where it must be +//! unlinked from the instance. The update saga unlinks the VMM from the +//! instance record as its last action, which allows the instance to be a valid +//! target for a start or delete saga. +//! +//! On the other hand, an `instance-migrate` saga can, potentially, mutate the +//! instance record while an update saga is running, if it attempts to start a +//! migration while an update is still being processed. If the migrate saga +//! starts during an update and completes before the update saga, the update +//! saga writing back an updated instance state to the instance record could +//! result in an [ABA problem]-like issue, where the changes made by the migrate +//! saga are clobbered by the update saga. These issues are instead guarded +//! against by the instance record's state generation number: the update saga +//! 
determines the generation for the updated instance record by incrementing +//! the generation number observed when the initial state for the update is +//! read. The query that writes back the instance's runtime state fails if the +//! generation number has changed since the state was read at the beginning of +//! the saga, which causes the saga to unwind. An unwinding saga activates the +//! `instance-updater` background task, which may in turn start a new saga if +//! the instance's current state still requires an update. +//! +//! To avoid unnecessarily changing an instance's state generation and +//! invalidating in-progress update sagas, unwinding `instance-start` and +//! `instance-migrate` sagas don't remove the VMMs and migrations they create +//! from the instance's `propolis_id`, `target_propolis_id`, and `migration_id` +//! fields. Instead, they transition the `vmm` records to +//! [`VmmState::SagaUnwound`], which is treated as equivalent to having no VMM +//! in that position by other instances of those sagas. +//! //! ### Avoiding Missed Updates, or, "The `InstanceRuntimeState` Will Always Get Through" //! //! The lock operation we've described above is really more of a "try-lock" @@ -290,6 +330,7 @@ //! //! [dist-locking]: //! https://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html +//! [ABA problem]: https://en.wikipedia.org/wiki/ABA_problem //! //! [^1]: And, if a process *can* die, well...we can assume it *will*. //! [^2]: Barring human intervention. @@ -739,6 +780,15 @@ declare_saga_actions! { // and re-fetch the VMM and migration states. If they have changed in a way // that requires an additional update saga, attempt to execute an additional // update saga immediately. + // + // Writing back the updated instance runtime state is conditional on both + // the instance updater lock *and* the instance record's state generation + // number. If the state generation has advanced since this update saga + // began, writing the new runtime state will fail, as the update was + // performed based on an initial state that is no longer current. In that + // case, this action will fail, causing the saga to unwind, release the + // updater lock, and activate the `instance-updater` background task to + // schedule new update saga if one is still required. COMMIT_INSTANCE_UPDATES -> "commit_instance_updates" { + siu_commit_instance_updates }
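    // As a rough illustration of the conditional write-back performed by the
    // action above (a sketch reconstructed from the hunks earlier in this
    // series; the authoritative query, including its full filter list, is
    // built in `DataStore::instance_commit_update`), the commit is a
    // compare-and-swap shaped like:
    //
    //     diesel::update(dsl::instance)
    //         .filter(dsl::id.eq(instance_id))
    //         // ...presumably also a filter on `dsl::updater_id`...
    //         .filter(dsl::updater_gen.eq(locked_gen))
    //         .filter(dsl::state_generation.lt(new_runtime.r#gen))
    //         .set((
    //             dsl::updater_gen.eq(Generation(locked_gen.0.next())),
    //             dsl::updater_id.eq(None::<Uuid>),
    //             // ...plus the new runtime-state columns...
    //         ))
    //
    // so it can only succeed while this saga still holds the updater lock and
    // no newer state has been written since the state this saga was built from.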