Skip to content

Commit a0cdce7

Browse files
authored
sled agent: index running VMMs by VMM ID, not instance ID (#6429)
Change sled agent's instance lookup tables so that Propolis jobs are indexed by Propolis/VMM IDs instead of instance IDs. This is a prerequisite to revisiting how the Failed instance state works. See RFD 486 section 6.1 for all the details of why this is needed, but very broadly: when an instance's VMM is Destroyed, we'd like sled agent to tell Nexus that *before* the agent deregisters the instance from the sled, for reasons described in the RFD; but if we do that with no other changes, there's a race where Nexus may try to restart the instance on the same sled before sled agent can update its instance table, causing instance start to fail. To achieve this: - In sled agent, change the `InstanceManagerRunner`'s instance map to a `BTreeMap<PropolisUuid, Instance>`, then clean up all the compilation errors. - In Nexus: - Make callers of instance APIs furnish a Propolis ID instead of an instance ID. This is generally very straightforward because they already had to get a VMM record to figure out what sled to talk to. - Change `cpapi_instances_put` to take a Propolis ID instead of an instance ID. Regular sled agent still has both IDs, but with these changes, simulated sled agents only have a Propolis ID to work with, and plumbing an instance ID down to them requires significant code changes. - Update test code: - Unify the Nexus helper routines that let integration tests get sled agent clients or sled IDs; now they get a single struct containing both of those and the instance's Propolis IDs. - Update users of the simulated agent's `poke` endpoints to use Propolis IDs. - Delete the "detach disks on instance stop" bits of simulated sled agent. These don't appear to be load-bearing, they don't correspond to any behavior in the actual sled agent (which doesn't manage disk attachment or detachment), and it was a pain to rework them to work with Propolis IDs. Tests: cargo nextest. Related: #4226 and #4872, among others.
1 parent 758818a commit a0cdce7

39 files changed

+1565
-1630
lines changed

clients/nexus-client/src/lib.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,14 +131,11 @@ impl From<omicron_common::api::internal::nexus::VmmRuntimeState>
131131
}
132132
}
133133

134-
impl From<omicron_common::api::internal::nexus::SledInstanceState>
135-
for types::SledInstanceState
134+
impl From<omicron_common::api::internal::nexus::SledVmmState>
135+
for types::SledVmmState
136136
{
137-
fn from(
138-
s: omicron_common::api::internal::nexus::SledInstanceState,
139-
) -> Self {
137+
fn from(s: omicron_common::api::internal::nexus::SledVmmState) -> Self {
140138
Self {
141-
propolis_id: s.propolis_id,
142139
vmm_state: s.vmm_state.into(),
143140
migration_in: s.migration_in.map(Into::into),
144141
migration_out: s.migration_out.map(Into::into),

clients/sled-agent-client/src/lib.rs

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
//! Interface for making API requests to a Sled Agent
66
77
use async_trait::async_trait;
8+
use omicron_uuid_kinds::PropolisUuid;
89
use schemars::JsonSchema;
910
use serde::Deserialize;
1011
use serde::Serialize;
@@ -161,12 +162,11 @@ impl From<types::VmmRuntimeState>
161162
}
162163
}
163164

164-
impl From<types::SledInstanceState>
165-
for omicron_common::api::internal::nexus::SledInstanceState
165+
impl From<types::SledVmmState>
166+
for omicron_common::api::internal::nexus::SledVmmState
166167
{
167-
fn from(s: types::SledInstanceState) -> Self {
168+
fn from(s: types::SledVmmState) -> Self {
168169
Self {
169-
propolis_id: s.propolis_id,
170170
vmm_state: s.vmm_state.into(),
171171
migration_in: s.migration_in.map(Into::into),
172172
migration_out: s.migration_out.map(Into::into),
@@ -448,33 +448,33 @@ impl From<types::SledIdentifiers>
448448
/// are bonus endpoints, not generated in the real client.
449449
#[async_trait]
450450
pub trait TestInterfaces {
451-
async fn instance_single_step(&self, id: Uuid);
452-
async fn instance_finish_transition(&self, id: Uuid);
453-
async fn instance_simulate_migration_source(
451+
async fn vmm_single_step(&self, id: PropolisUuid);
452+
async fn vmm_finish_transition(&self, id: PropolisUuid);
453+
async fn vmm_simulate_migration_source(
454454
&self,
455-
id: Uuid,
455+
id: PropolisUuid,
456456
params: SimulateMigrationSource,
457457
);
458458
async fn disk_finish_transition(&self, id: Uuid);
459459
}
460460

461461
#[async_trait]
462462
impl TestInterfaces for Client {
463-
async fn instance_single_step(&self, id: Uuid) {
463+
async fn vmm_single_step(&self, id: PropolisUuid) {
464464
let baseurl = self.baseurl();
465465
let client = self.client();
466-
let url = format!("{}/instances/{}/poke-single-step", baseurl, id);
466+
let url = format!("{}/vmms/{}/poke-single-step", baseurl, id);
467467
client
468468
.post(url)
469469
.send()
470470
.await
471471
.expect("instance_single_step() failed unexpectedly");
472472
}
473473

474-
async fn instance_finish_transition(&self, id: Uuid) {
474+
async fn vmm_finish_transition(&self, id: PropolisUuid) {
475475
let baseurl = self.baseurl();
476476
let client = self.client();
477-
let url = format!("{}/instances/{}/poke", baseurl, id);
477+
let url = format!("{}/vmms/{}/poke", baseurl, id);
478478
client
479479
.post(url)
480480
.send()
@@ -493,14 +493,14 @@ impl TestInterfaces for Client {
493493
.expect("disk_finish_transition() failed unexpectedly");
494494
}
495495

496-
async fn instance_simulate_migration_source(
496+
async fn vmm_simulate_migration_source(
497497
&self,
498-
id: Uuid,
498+
id: PropolisUuid,
499499
params: SimulateMigrationSource,
500500
) {
501501
let baseurl = self.baseurl();
502502
let client = self.client();
503-
let url = format!("{baseurl}/instances/{id}/sim-migration-source");
503+
let url = format!("{baseurl}/vmms/{id}/sim-migration-source");
504504
client
505505
.post(url)
506506
.json(&params)

common/src/api/internal/nexus.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,9 @@ pub struct VmmRuntimeState {
113113
pub time_updated: DateTime<Utc>,
114114
}
115115

116-
/// A wrapper type containing a sled's total knowledge of the state of a
117-
/// specific VMM and the instance it incarnates.
116+
/// A wrapper type containing a sled's total knowledge of the state of a VMM.
118117
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
119-
pub struct SledInstanceState {
120-
/// The ID of the VMM whose state is being reported.
121-
pub propolis_id: PropolisUuid,
122-
118+
pub struct SledVmmState {
123119
/// The most recent state of the sled's VMM process.
124120
pub vmm_state: VmmRuntimeState,
125121

@@ -142,7 +138,7 @@ impl Migrations<'_> {
142138
}
143139
}
144140

145-
impl SledInstanceState {
141+
impl SledVmmState {
146142
pub fn migrations(&self) -> Migrations<'_> {
147143
Migrations {
148144
migration_in: self.migration_in.as_ref(),

nexus/db-queries/src/db/datastore/vmm.rs

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
//! [`DataStore`] helpers for working with VMM records.
66
77
use super::DataStore;
8-
use crate::authz;
98
use crate::context::OpContext;
109
use crate::db;
1110
use crate::db::error::public_error_from_diesel;
@@ -40,8 +39,13 @@ use uuid::Uuid;
4039

4140
/// The result of an [`DataStore::vmm_and_migration_update_runtime`] call,
4241
/// indicating which records were updated.
43-
#[derive(Copy, Clone, Debug)]
42+
#[derive(Clone, Debug)]
4443
pub struct VmmStateUpdateResult {
44+
/// The VMM record that the update query found and possibly updated.
45+
///
46+
/// NOTE: This is the record prior to the update!
47+
pub found_vmm: Vmm,
48+
4549
/// `true` if the VMM record was updated, `false` otherwise.
4650
pub vmm_updated: bool,
4751

@@ -108,14 +112,10 @@ impl DataStore {
108112
pub async fn vmm_fetch(
109113
&self,
110114
opctx: &OpContext,
111-
authz_instance: &authz::Instance,
112115
vmm_id: &PropolisUuid,
113116
) -> LookupResult<Vmm> {
114-
opctx.authorize(authz::Action::Read, authz_instance).await?;
115-
116117
let vmm = dsl::vmm
117118
.filter(dsl::id.eq(vmm_id.into_untyped_uuid()))
118-
.filter(dsl::instance_id.eq(authz_instance.id()))
119119
.filter(dsl::time_deleted.is_null())
120120
.select(Vmm::as_select())
121121
.get_result_async(&*self.pool_connection_authorized(opctx).await?)
@@ -233,13 +233,21 @@ impl DataStore {
233233
.transaction(&conn, |conn| {
234234
let err = err.clone();
235235
async move {
236-
let vmm_updated = self
236+
let vmm_update_result = self
237237
.vmm_update_runtime_on_connection(
238238
&conn,
239239
&vmm_id,
240240
new_runtime,
241241
)
242-
.await.map(|r| match r.status { UpdateStatus::Updated => true, UpdateStatus::NotUpdatedButExists => false })?;
242+
.await?;
243+
244+
245+
let found_vmm = vmm_update_result.found;
246+
let vmm_updated = match vmm_update_result.status {
247+
UpdateStatus::Updated => true,
248+
UpdateStatus::NotUpdatedButExists => false
249+
};
250+
243251
let migration_out_updated = match migration_out {
244252
Some(migration) => {
245253
let r = self.migration_update_source_on_connection(
@@ -287,6 +295,7 @@ impl DataStore {
287295
None => false,
288296
};
289297
Ok(VmmStateUpdateResult {
298+
found_vmm,
290299
vmm_updated,
291300
migration_in_updated,
292301
migration_out_updated,

nexus/internal-api/src/lib.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ use omicron_common::{
3333
DiskRuntimeState, DownstairsClientStopRequest,
3434
DownstairsClientStopped, ProducerEndpoint,
3535
ProducerRegistrationResponse, RepairFinishInfo, RepairProgress,
36-
RepairStartInfo, SledInstanceState,
36+
RepairStartInfo, SledVmmState,
3737
},
3838
},
3939
update::ArtifactId,
4040
};
4141
use omicron_uuid_kinds::{
42-
DemoSagaUuid, DownstairsKind, SledUuid, TypedUuid, UpstairsKind,
43-
UpstairsRepairKind,
42+
DemoSagaUuid, DownstairsKind, PropolisUuid, SledUuid, TypedUuid,
43+
UpstairsKind, UpstairsRepairKind,
4444
};
4545
use schemars::JsonSchema;
4646
use serde::{Deserialize, Serialize};
@@ -108,15 +108,15 @@ pub trait NexusInternalApi {
108108
body: TypedBody<SwitchPutRequest>,
109109
) -> Result<HttpResponseOk<SwitchPutResponse>, HttpError>;
110110

111-
/// Report updated state for an instance.
111+
/// Report updated state for a VMM.
112112
#[endpoint {
113113
method = PUT,
114-
path = "/instances/{instance_id}",
114+
path = "/vmms/{propolis_id}",
115115
}]
116116
async fn cpapi_instances_put(
117117
rqctx: RequestContext<Self::Context>,
118-
path_params: Path<InstancePathParam>,
119-
new_runtime_state: TypedBody<SledInstanceState>,
118+
path_params: Path<VmmPathParam>,
119+
new_runtime_state: TypedBody<SledVmmState>,
120120
) -> Result<HttpResponseUpdatedNoContent, HttpError>;
121121

122122
#[endpoint {
@@ -568,6 +568,12 @@ pub struct InstancePathParam {
568568
pub instance_id: Uuid,
569569
}
570570

571+
/// Path parameters for VMM requests (internal API)
572+
#[derive(Deserialize, JsonSchema)]
573+
pub struct VmmPathParam {
574+
pub propolis_id: PropolisUuid,
575+
}
576+
571577
#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, Serialize)]
572578
pub struct CollectorIdPathParams {
573579
/// The ID of the oximeter collector.

nexus/src/app/background/tasks/abandoned_vmm_reaper.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
//! remains alive and continues to own its virtual provisioning resources.
2929
//!
3030
//! Cleanup of instance resources when an instance's *active* VMM is destroyed
31-
//! is handled elsewhere, by `notify_instance_updated` and (eventually) the
32-
//! `instance-update` saga.
31+
//! is handled elsewhere, by `process_vmm_update` and the `instance-update`
32+
//! saga.
3333
3434
use crate::app::background::BackgroundTask;
3535
use anyhow::Context;

nexus/src/app/background/tasks/instance_watcher.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ use nexus_types::identity::Asset;
1919
use nexus_types::identity::Resource;
2020
use omicron_common::api::external::Error;
2121
use omicron_common::api::external::InstanceState;
22-
use omicron_common::api::internal::nexus::SledInstanceState;
22+
use omicron_common::api::internal::nexus::SledVmmState;
2323
use omicron_uuid_kinds::GenericUuid;
24-
use omicron_uuid_kinds::InstanceUuid;
24+
use omicron_uuid_kinds::PropolisUuid;
2525
use oximeter::types::ProducerRegistry;
2626
use sled_agent_client::Client as SledAgentClient;
2727
use std::borrow::Cow;
@@ -81,12 +81,12 @@ impl InstanceWatcher {
8181
let client = client.clone();
8282

8383
async move {
84-
slog::trace!(opctx.log, "checking on instance...");
85-
let rsp = client
86-
.instance_get_state(&InstanceUuid::from_untyped_uuid(
87-
target.instance_id,
88-
))
89-
.await;
84+
let vmm_id = PropolisUuid::from_untyped_uuid(target.vmm_id);
85+
slog::trace!(
86+
opctx.log, "checking on VMM"; "propolis_id" => %vmm_id
87+
);
88+
89+
let rsp = client.vmm_get_state(&vmm_id).await;
9090
let mut check = Check {
9191
target,
9292
outcome: Default::default(),
@@ -151,18 +151,18 @@ impl InstanceWatcher {
151151
}
152152
};
153153

154-
let new_runtime_state: SledInstanceState = state.into();
154+
let new_runtime_state: SledVmmState = state.into();
155155
check.outcome =
156156
CheckOutcome::Success(new_runtime_state.vmm_state.state.into());
157157
debug!(
158158
opctx.log,
159159
"updating instance state";
160160
"state" => ?new_runtime_state.vmm_state.state,
161161
);
162-
match crate::app::instance::notify_instance_updated(
162+
match crate::app::instance::process_vmm_update(
163163
&datastore,
164164
&opctx,
165-
InstanceUuid::from_untyped_uuid(target.instance_id),
165+
PropolisUuid::from_untyped_uuid(target.vmm_id),
166166
&new_runtime_state,
167167
)
168168
.await
@@ -176,7 +176,7 @@ impl InstanceWatcher {
176176
_ => Err(Incomplete::UpdateFailed),
177177
};
178178
}
179-
Ok(Some(saga)) => {
179+
Ok(Some((_, saga))) => {
180180
check.update_saga_queued = true;
181181
if let Err(e) = sagas.saga_start(saga).await {
182182
warn!(opctx.log, "update saga failed"; "error" => ?e);

0 commit comments

Comments
 (0)