Skip to content

Commit 0a0db97

Browse files
authored
Add migration table and explicit migration tracking in sled-agent (#5859)
As part of ongoing work on improving instance lifecycle management (see #5749), we intend to remove the `InstanceRuntimeState` tracking from `sled-agent`, and make Nexus the sole owner of `instance` records in CRDB, with a new `instance-update` saga taking over the responsibility of managing the instance's state transitions. In order to properly manage the instance state machine, Nexus will need information about the status of active migrations that is currently only available to sled-agents. For example, if an instance is migrating, and a sled agent reports that the source VMM is `Destroyed`, Nexus doesn't presently have the capability to determine whether the source VMM was destroyed because the migration completed successfully, or because the source shut down prior to starting the migration, resulting in a failure. In order for Nexus to correctly manage state updates during live migration, we introduce a new `migration` table to the schema for tracking the state of ongoing migrations. The `instance-migrate` saga creates a `migration` record when beginning a migration. The Nexus and sled-agent APIs are extended to include migration state updates from sled-agents to Nexus. In *this* branch, the `migration` table is (basically) write-only; Nexus doesn't really read from it, and just stuffs updates into it. In the future, however, this will be used by the `instance-update` saga. It occurred to me that, in addition to using the migration table for instance updates, it might also be useful to add an OMDB command to look up the status of a migration using this table. However, I decided that made more sense as a follow-up change, as I'd like to get back to integrating this into #5749. Fixes #2948
1 parent 5f75a98 commit 0a0db97

File tree

24 files changed

+1389
-151
lines changed

24 files changed

+1389
-151
lines changed

clients/nexus-client/src/lib.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,49 @@ impl From<omicron_common::api::internal::nexus::SledInstanceState>
149149
instance_state: s.instance_state.into(),
150150
propolis_id: s.propolis_id,
151151
vmm_state: s.vmm_state.into(),
152+
migration_state: s.migration_state.map(Into::into),
153+
}
154+
}
155+
}
156+
157+
// Converts the `omicron_common` migration runtime state into this crate's
// `types::MigrationRuntimeState`.
impl From<omicron_common::api::internal::nexus::MigrationRuntimeState>
158+
for types::MigrationRuntimeState
159+
{
160+
// Field-by-field conversion: `role` and `state` are converted with
// `.into()`; `migration_id`, `gen` and `time_updated` are moved unchanged.
fn from(
161+
s: omicron_common::api::internal::nexus::MigrationRuntimeState,
162+
) -> Self {
163+
Self {
164+
migration_id: s.migration_id,
165+
role: s.role.into(),
166+
state: s.state.into(),
167+
gen: s.gen,
168+
time_updated: s.time_updated,
169+
}
170+
}
171+
}
172+
173+
// Converts the `omicron_common` migration role into this crate's
// `types::MigrationRole`.
impl From<omicron_common::api::internal::nexus::MigrationRole>
174+
for types::MigrationRole
175+
{
176+
// Maps each input variant to the variant of the same name on `Self`.
fn from(s: omicron_common::api::internal::nexus::MigrationRole) -> Self {
177+
// Local alias so the match arms stay short.
use omicron_common::api::internal::nexus::MigrationRole as Input;
178+
match s {
179+
Input::Source => Self::Source,
180+
Input::Target => Self::Target,
181+
}
182+
}
183+
}
184+
185+
// Converts the `omicron_common` migration state into this crate's
// `types::MigrationState`.
impl From<omicron_common::api::internal::nexus::MigrationState>
186+
for types::MigrationState
187+
{
188+
// Maps each input variant to the variant of the same name on `Self`.
fn from(s: omicron_common::api::internal::nexus::MigrationState) -> Self {
189+
// Local alias so the match arms stay short.
use omicron_common::api::internal::nexus::MigrationState as Input;
190+
// No wildcard arm: all four input variants are listed explicitly.
match s {
191+
Input::Pending => Self::Pending,
192+
Input::InProgress => Self::InProgress,
193+
Input::Completed => Self::Completed,
194+
Input::Failed => Self::Failed,
152195
}
153196
}
154197
}

clients/sled-agent-client/src/lib.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,47 @@ impl From<types::SledInstanceState>
328328
instance_state: s.instance_state.into(),
329329
propolis_id: s.propolis_id,
330330
vmm_state: s.vmm_state.into(),
331+
migration_state: s.migration_state.map(Into::into),
332+
}
333+
}
334+
}
335+
336+
// Converts this crate's `types::MigrationRuntimeState` into the
// `omicron_common` migration runtime state.
impl From<types::MigrationRuntimeState>
337+
for omicron_common::api::internal::nexus::MigrationRuntimeState
338+
{
339+
// Field-by-field conversion: `state` and `role` are converted with
// `.into()`; `migration_id`, `gen` and `time_updated` are moved unchanged.
fn from(s: types::MigrationRuntimeState) -> Self {
340+
Self {
341+
migration_id: s.migration_id,
342+
state: s.state.into(),
343+
role: s.role.into(),
344+
gen: s.gen,
345+
time_updated: s.time_updated,
346+
}
347+
}
348+
}
349+
350+
// Converts this crate's `types::MigrationRole` into the `omicron_common`
// migration role.
impl From<types::MigrationRole>
351+
for omicron_common::api::internal::nexus::MigrationRole
352+
{
353+
// Maps each input variant to the output variant of the same name.
fn from(r: types::MigrationRole) -> Self {
354+
// Local alias so the match arms stay short.
use omicron_common::api::internal::nexus::MigrationRole as Output;
355+
match r {
356+
types::MigrationRole::Source => Output::Source,
357+
types::MigrationRole::Target => Output::Target,
358+
}
359+
}
360+
}
361+
362+
// Converts this crate's `types::MigrationState` into the `omicron_common`
// migration state.
impl From<types::MigrationState>
363+
for omicron_common::api::internal::nexus::MigrationState
364+
{
365+
// Maps each input variant to the output variant of the same name.
fn from(s: types::MigrationState) -> Self {
366+
// Local alias so the match arms stay short.
use omicron_common::api::internal::nexus::MigrationState as Output;
367+
// No wildcard arm: all four input variants are listed explicitly.
match s {
368+
types::MigrationState::Pending => Output::Pending,
369+
types::MigrationState::InProgress => Output::InProgress,
370+
types::MigrationState::Failed => Output::Failed,
371+
types::MigrationState::Completed => Output::Completed,
331372
}
332373
}
333374
}

common/src/api/internal/nexus.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use omicron_uuid_kinds::UpstairsSessionKind;
1616
use parse_display::{Display, FromStr};
1717
use schemars::JsonSchema;
1818
use serde::{Deserialize, Serialize};
19+
use std::fmt;
1920
use std::net::SocketAddr;
2021
use std::time::Duration;
2122
use strum::{EnumIter, IntoEnumIterator};
@@ -108,6 +109,97 @@ pub struct SledInstanceState {
108109

109110
/// The most recent state of the sled's VMM process.
110111
pub vmm_state: VmmRuntimeState,
112+
113+
/// The current state of any in-progress migration for this instance, as
114+
/// understood by this sled.
115+
pub migration_state: Option<MigrationRuntimeState>,
116+
}
117+
118+
/// An update from a sled regarding the state of a migration, indicating the
119+
/// role of the VMM whose migration state was updated.
120+
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
121+
pub struct MigrationRuntimeState {
122+
// ID of the migration that this update describes.
pub migration_id: Uuid,
123+
// State of the migration being reported (see `MigrationState`).
pub state: MigrationState,
124+
// Whether the VMM this update concerns is the migration's source or target.
pub role: MigrationRole,
125+
// Generation number attached to this update.
// NOTE(review): presumably used to order/deduplicate updates — confirm
// against the consumers of this type.
pub gen: Generation,
126+
127+
/// Timestamp for the migration state update.
128+
pub time_updated: DateTime<Utc>,
129+
}
130+
131+
/// The state of an instance's live migration.
132+
#[derive(
133+
Clone,
134+
Copy,
135+
Debug,
136+
Default,
137+
PartialEq,
138+
Eq,
139+
Deserialize,
140+
Serialize,
141+
JsonSchema,
142+
)]
143+
// Variants are (de)serialized under their snake_case names
// (e.g. `InProgress` <-> "in_progress").
#[serde(rename_all = "snake_case")]
144+
pub enum MigrationState {
145+
/// The migration has not started for this VMM.
146+
// `Pending` is the value returned by `MigrationState::default()`.
#[default]
147+
Pending,
148+
/// The migration is in progress.
149+
InProgress,
150+
/// The migration has failed.
151+
Failed,
152+
/// The migration has completed.
153+
Completed,
154+
}
155+
156+
impl MigrationState {
157+
/// Returns the snake_case name of this state as a static string
/// (`"pending"`, `"in_progress"`, `"completed"` or `"failed"`).
pub fn label(&self) -> &'static str {
158+
match self {
159+
Self::Pending => "pending",
160+
Self::InProgress => "in_progress",
161+
Self::Completed => "completed",
162+
Self::Failed => "failed",
163+
}
164+
}
165+
/// Returns `true` if this migration state means that the migration is no
166+
/// longer in progress (it has either succeeded or failed).
167+
#[must_use]
168+
pub fn is_terminal(&self) -> bool {
169+
// Terminal states are exactly `Completed` and `Failed`.
matches!(self, MigrationState::Completed | MigrationState::Failed)
170+
}
171+
}
172+
173+
// Displays a `MigrationState` as the string returned by `label()`.
impl fmt::Display for MigrationState {
174+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
175+
// Written with `write_str`, so formatter width/padding flags are not applied.
f.write_str(self.label())
176+
}
177+
}
178+
179+
// Identifies which side of a migration (source or target VMM) a migration
// state update concerns.
#[derive(
180+
Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema,
181+
)]
182+
// Variants are (de)serialized under their snake_case names
// ("source" / "target").
#[serde(rename_all = "snake_case")]
183+
pub enum MigrationRole {
184+
/// This update concerns the source VMM of a migration.
185+
Source,
186+
/// This update concerns the target VMM of a migration.
187+
Target,
188+
}
189+
190+
impl MigrationRole {
191+
/// Returns the lowercase name of this role as a static string
/// (`"source"` or `"target"`).
pub fn label(&self) -> &'static str {
192+
match self {
193+
Self::Source => "source",
194+
Self::Target => "target",
195+
}
196+
}
197+
}
198+
199+
// Displays a `MigrationRole` as the string returned by `label()`.
impl fmt::Display for MigrationRole {
200+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
201+
// Written with `write_str`, so formatter width/padding flags are not applied.
f.write_str(self.label())
202+
}
111203
}
112204

113205
// Oximeter producer/collector objects.

nexus/db-model/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ pub mod ipv6;
4242
mod ipv6net;
4343
mod l4_port_range;
4444
mod macaddr;
45+
mod migration;
46+
mod migration_state;
4547
mod name;
4648
mod network_interface;
4749
mod oximeter_info;
@@ -152,6 +154,8 @@ pub use ipv4net::*;
152154
pub use ipv6::*;
153155
pub use ipv6net::*;
154156
pub use l4_port_range::*;
157+
pub use migration::*;
158+
pub use migration_state::*;
155159
pub use name::*;
156160
pub use network_interface::*;
157161
pub use oximeter_info::*;

nexus/db-model/src/migration.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
use super::Generation;
6+
use crate::schema::migration;
7+
use crate::MigrationState;
8+
use chrono::DateTime;
9+
use chrono::Utc;
10+
use omicron_common::api::internal::nexus;
11+
use serde::Deserialize;
12+
use serde::Serialize;
13+
use uuid::Uuid;
14+
15+
/// The state of a migration as understood by Nexus.
16+
#[derive(
17+
Clone, Debug, Queryable, Insertable, Selectable, Serialize, Deserialize,
18+
)]
19+
// Maps this struct onto the `migration` table from `crate::schema`.
#[diesel(table_name = migration)]
20+
pub struct Migration {
21+
/// The migration's UUID.
22+
///
23+
/// This is the primary key of the migration table and is referenced by the
24+
/// `instance` table's `migration_id` field.
25+
pub id: Uuid,
26+
27+
/// The time at which this migration record was created.
28+
pub time_created: DateTime<Utc>,
29+
30+
/// The time at which this migration record was deleted, if it has been
/// deleted.
31+
pub time_deleted: Option<DateTime<Utc>>,
32+
33+
/// The state of the migration source VMM.
34+
pub source_state: MigrationState,
35+
36+
/// The ID of the migration source VMM.
37+
pub source_propolis_id: Uuid,
38+
39+
/// The generation number for the source state.
40+
pub source_gen: Generation,
41+
42+
/// The time the source VMM state was most recently updated.
43+
pub time_source_updated: Option<DateTime<Utc>>,
44+
45+
/// The state of the migration target VMM.
46+
pub target_state: MigrationState,
47+
48+
/// The ID of the migration target VMM.
49+
pub target_propolis_id: Uuid,
50+
51+
/// The generation number for the target state.
52+
pub target_gen: Generation,
53+
54+
/// The time the target VMM state was most recently updated.
55+
pub time_target_updated: Option<DateTime<Utc>>,
56+
}
57+
58+
impl Migration {
59+
/// Builds a fresh migration record for the given migration ID and the
/// source and target Propolis (VMM) IDs.
///
/// Both the source and target states start as `Pending`, both generation
/// numbers start at `Generation::new()`, `time_created` is the current UTC
/// time, and the deletion and per-side update timestamps are `None`.
pub fn new(
60+
migration_id: Uuid,
61+
source_propolis_id: Uuid,
62+
target_propolis_id: Uuid,
63+
) -> Self {
64+
Self {
65+
id: migration_id,
66+
time_created: Utc::now(),
67+
time_deleted: None,
68+
source_state: nexus::MigrationState::Pending.into(),
69+
source_propolis_id,
70+
source_gen: Generation::new(),
71+
time_source_updated: None,
72+
target_state: nexus::MigrationState::Pending.into(),
73+
target_propolis_id,
74+
target_gen: Generation::new(),
75+
time_target_updated: None,
76+
}
77+
}
78+
}

nexus/db-model/src/migration_state.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
//! Database representation of a migration's state as understood by Nexus.
6+
7+
use super::impl_enum_wrapper;
8+
use omicron_common::api::internal::nexus;
9+
use serde::Deserialize;
10+
use serde::Serialize;
11+
use std::fmt;
12+
use std::io::Write;
13+
14+
// Declares the SQL type marker `MigrationStateEnum` (Postgres type
// `public.migration_state`) and the `MigrationState` newtype around
// `nexus::MigrationState`, and lists the byte-string label paired with each
// variant.
// NOTE(review): the exact impls generated are defined by `impl_enum_wrapper!`
// elsewhere in this crate — confirm there.
impl_enum_wrapper!(
15+
#[derive(Clone, SqlType, Debug, QueryId)]
16+
#[diesel(postgres_type(name = "migration_state", schema = "public"))]
17+
pub struct MigrationStateEnum;
18+
19+
#[derive(Clone, Copy, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq, Eq)]
20+
#[diesel(sql_type = MigrationStateEnum)]
21+
pub struct MigrationState(pub nexus::MigrationState);
22+
23+
// Enum values
24+
Pending => b"pending"
25+
InProgress => b"in_progress"
26+
Completed => b"completed"
27+
Failed => b"failed"
28+
);
29+
30+
impl MigrationState {
31+
/// Returns `true` if this migration state means that the migration is no
32+
/// longer in progress (it has either succeeded or failed).
33+
#[must_use]
34+
pub fn is_terminal(&self) -> bool {
35+
// Delegates to the wrapped `nexus::MigrationState`.
self.0.is_terminal()
36+
}
37+
}
38+
39+
// Displays the database wrapper exactly as the wrapped
// `nexus::MigrationState` displays itself.
impl fmt::Display for MigrationState {
40+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41+
// Forward to the inner value's `Display` impl.
fmt::Display::fmt(&self.0, f)
42+
}
43+
}
44+
45+
// Wraps a `nexus::MigrationState` in the database newtype without changing it.
impl From<nexus::MigrationState> for MigrationState {
46+
fn from(s: nexus::MigrationState) -> Self {
47+
Self(s)
48+
}
49+
}

nexus/db-model/src/schema.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1759,6 +1759,25 @@ table! {
17591759
}
17601760
}
17611761

1762+
// Diesel schema for the `migration` table, keyed by `id`. The two state
// columns use the `MigrationStateEnum` SQL type; the generation columns are
// `Int8`; the deletion and per-side update timestamps are nullable.
table! {
1763+
migration (id) {
1764+
id -> Uuid,
1765+
time_created -> Timestamptz,
1766+
time_deleted -> Nullable<Timestamptz>,
1767+
source_state -> crate::MigrationStateEnum,
1768+
source_propolis_id -> Uuid,
1769+
source_gen -> Int8,
1770+
time_source_updated -> Nullable<Timestamptz>,
1771+
target_state -> crate::MigrationStateEnum,
1772+
target_propolis_id -> Uuid,
1773+
target_gen -> Int8,
1774+
time_target_updated -> Nullable<Timestamptz>,
1775+
}
1776+
}
1777+
1778+
// Permit `instance` and `migration` to be used together in a single query.
allow_tables_to_appear_in_same_query!(instance, migration);
1779+
// Declare the join between the two tables via `instance.migration_id`.
joinable!(instance -> migration (migration_id));
1780+
17621781
allow_tables_to_appear_in_same_query!(
17631782
ip_pool_range,
17641783
ip_pool,

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
1717
///
1818
/// This must be updated when you change the database schema. Refer to
1919
/// schema/crdb/README.adoc in the root of this repository for details.
20-
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(73, 0, 0);
20+
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(74, 0, 0);
2121

2222
/// List of all past database schema versions, in *reverse* order
2323
///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
2929
// | leaving the first copy as an example for the next person.
3030
// v
3131
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
32+
KnownVersion::new(74, "add-migration-table"),
3233
KnownVersion::new(73, "add-vlan-to-uplink"),
3334
KnownVersion::new(72, "fix-provisioning-counters"),
3435
KnownVersion::new(71, "add-saga-unwound-vmm-state"),

0 commit comments

Comments
 (0)