Skip to content

Commit 0a5b91a

Browse files
authored
Add an SMF property for rack_id to MGS (#4145)
A follow up commit will plumb the rack_id along with any discovered sleds to a new "hardware_sled" table in CRDB used for sled addition to an already initialized rack. This is going to be required when adding sleds to initialized racks, in order to create a `StartSledAgentRequest` for new sleds.
1 parent 9a1f6bc commit 0a5b91a

File tree

8 files changed

+110
-79
lines changed

8 files changed

+110
-79
lines changed

gateway-test-utils/src/setup.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,13 +142,12 @@ pub async fn test_setup_with_config(
142142
}
143143

144144
// Start gateway server
145-
let rack_id = Uuid::parse_str(RACK_UUID).unwrap();
145+
let rack_id = Some(Uuid::parse_str(RACK_UUID).unwrap());
146146

147-
let args = MgsArguments { id: Uuid::new_v4(), addresses };
147+
let args = MgsArguments { id: Uuid::new_v4(), addresses, rack_id };
148148
let server = omicron_gateway::Server::start(
149149
server_config.clone(),
150150
args,
151-
rack_id,
152151
log.clone(),
153152
)
154153
.await

gateway/src/bin/mgs.rs

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ enum Args {
5656
struct ConfigProperties {
5757
id: Uuid,
5858
addresses: Vec<SocketAddrV6>,
59+
rack_id: Option<Uuid>,
5960
}
6061

6162
#[tokio::main]
@@ -91,15 +92,17 @@ async fn do_run() -> Result<(), CmdError> {
9192
))
9293
})?;
9394

94-
let (id, addresses) = if id_and_address_from_smf {
95+
let (id, addresses, rack_id) = if id_and_address_from_smf {
9596
let config = read_smf_config()?;
96-
(config.id, config.addresses)
97+
(config.id, config.addresses, config.rack_id)
9798
} else {
98-
// Clap ensures these are present if `id_and_address_from_smf`
99-
// is false, so we can safely unwrap.
100-
(id.unwrap(), vec![address.unwrap()])
99+
// Does it matter if `rack_id` is always `None` in this case?
100+
let rack_id = None;
101+
// Clap ensures the first two fields are present if
102+
// `id_and_address_from_smf` is false, so we can safely unwrap.
103+
(id.unwrap(), vec![address.unwrap()], rack_id)
101104
};
102-
let args = MgsArguments { id, addresses };
105+
let args = MgsArguments { id, addresses, rack_id };
103106
let mut server =
104107
start_server(config, args).await.map_err(CmdError::Failure)?;
105108

@@ -114,6 +117,7 @@ async fn do_run() -> Result<(), CmdError> {
114117
.to_string()
115118
));
116119
}
120+
server.set_rack_id(new_config.rack_id);
117121
server
118122
.adjust_dropshot_addresses(&new_config.addresses)
119123
.await
@@ -148,6 +152,9 @@ fn read_smf_config() -> Result<ConfigProperties, CmdError> {
148152
// Name of the property within CONFIG_PG for our server addresses.
149153
const PROP_ADDR: &str = "address";
150154

155+
// Name of the property within CONFIG_PG for our rack ID.
156+
const PROP_RACK_ID: &str = "rack_id";
157+
151158
// This function is pretty boilerplate-y; we can reduce it by using this
152159
// error type to help us construct a `CmdError::Failure(_)` string. It
153160
// assumes (for the purposes of error messages) any property being fetched
@@ -210,6 +217,26 @@ fn read_smf_config() -> Result<ConfigProperties, CmdError> {
210217
))
211218
})?;
212219

220+
let prop_rack_id = config
221+
.get_property(PROP_RACK_ID)
222+
.map_err(|err| Error::GetProperty { prop: PROP_RACK_ID, err })?
223+
.ok_or_else(|| Error::MissingProperty { prop: PROP_RACK_ID })?
224+
.value()
225+
.map_err(|err| Error::GetValue { prop: PROP_RACK_ID, err })?
226+
.ok_or(Error::MissingValue { prop: PROP_RACK_ID })?
227+
.as_string()
228+
.map_err(|err| Error::ValueAsString { prop: PROP_RACK_ID, err })?;
229+
230+
let rack_id = if prop_rack_id.as_str() == "unknown" {
231+
None
232+
} else {
233+
Some(Uuid::try_parse(&prop_rack_id).map_err(|err| {
234+
CmdError::Failure(format!(
235+
"failed to parse `{CONFIG_PG}/{PROP_RACK_ID}` ({prop_rack_id:?}) as a UUID: {err}"
236+
))
237+
})?)
238+
};
239+
213240
let prop_addr = config
214241
.get_property(PROP_ADDR)
215242
.map_err(|err| Error::GetProperty { prop: PROP_ADDR, err })?
@@ -236,7 +263,7 @@ fn read_smf_config() -> Result<ConfigProperties, CmdError> {
236263
"no addresses specified by `{CONFIG_PG}/{PROP_ADDR}`"
237264
)))
238265
} else {
239-
Ok(ConfigProperties { id: prop_id, addresses })
266+
Ok(ConfigProperties { id: prop_id, addresses, rack_id })
240267
}
241268
}
242269

gateway/src/context.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,41 @@ use crate::error::StartupError;
66
use crate::management_switch::ManagementSwitch;
77
use crate::management_switch::SwitchConfig;
88
use gateway_sp_comms::InMemoryHostPhase2Provider;
9-
use slog::Logger;
9+
use slog::{info, Logger};
1010
use std::sync::Arc;
11+
use std::sync::OnceLock;
12+
use uuid::Uuid;
1113

1214
/// Shared state used by API request handlers
1315
pub struct ServerContext {
1416
pub mgmt_switch: ManagementSwitch,
1517
pub host_phase2_provider: Arc<InMemoryHostPhase2Provider>,
18+
pub rack_id: OnceLock<Uuid>,
1619
pub log: Logger,
1720
}
1821

1922
impl ServerContext {
2023
pub async fn new(
2124
host_phase2_provider: Arc<InMemoryHostPhase2Provider>,
2225
switch_config: SwitchConfig,
26+
rack_id_config: Option<Uuid>,
2327
log: &Logger,
2428
) -> Result<Arc<Self>, StartupError> {
2529
let mgmt_switch =
2630
ManagementSwitch::new(switch_config, &host_phase2_provider, log)
2731
.await?;
2832

33+
let rack_id = if let Some(id) = rack_id_config {
34+
info!(log, "Setting rack_id"; "rack_id" => %id);
35+
OnceLock::from(id)
36+
} else {
37+
OnceLock::new()
38+
};
39+
2940
Ok(Arc::new(ServerContext {
3041
mgmt_switch,
3142
host_phase2_provider,
43+
rack_id,
3244
log: log.clone(),
3345
}))
3446
}

gateway/src/lib.rs

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ pub fn run_openapi() -> Result<(), String> {
5858
pub struct MgsArguments {
5959
pub id: Uuid,
6060
pub addresses: Vec<SocketAddrV6>,
61+
pub rack_id: Option<Uuid>,
6162
}
6263

6364
type HttpServer = dropshot::HttpServer<Arc<ServerContext>>;
@@ -125,7 +126,6 @@ impl Server {
125126
pub async fn start(
126127
config: Config,
127128
args: MgsArguments,
128-
_rack_id: Uuid,
129129
log: Logger,
130130
) -> Result<Server, String> {
131131
if args.addresses.is_empty() {
@@ -146,12 +146,14 @@ impl Server {
146146
Arc::new(InMemoryHostPhase2Provider::with_capacity(
147147
config.host_phase2_recovery_image_cache_max_images,
148148
));
149-
let apictx =
150-
ServerContext::new(host_phase2_provider, config.switch, &log)
151-
.await
152-
.map_err(|error| {
153-
format!("initializing server context: {}", error)
154-
})?;
149+
let apictx = ServerContext::new(
150+
host_phase2_provider,
151+
config.switch,
152+
args.rack_id,
153+
&log,
154+
)
155+
.await
156+
.map_err(|error| format!("initializing server context: {}", error))?;
155157

156158
let mut http_servers = HashMap::with_capacity(args.addresses.len());
157159
let all_servers_shutdown = FuturesUnordered::new();
@@ -283,6 +285,25 @@ impl Server {
283285
Ok(())
284286
}
285287

288+
/// The rack_id will be set on a refresh of the SMF property when the sled
289+
/// agent starts.
290+
pub fn set_rack_id(&self, rack_id: Option<Uuid>) {
291+
if let Some(rack_id) = rack_id {
292+
let val = self.apictx.rack_id.get_or_init(|| rack_id);
293+
if *val != rack_id {
294+
error!(
295+
self.apictx.log,
296+
"Ignoring attempted change to rack ID";
297+
"current_rack_id" => %val,
298+
"ignored_new_rack_id" => %rack_id);
299+
} else {
300+
info!(self.apictx.log, "Set rack_id"; "rack_id" => %rack_id);
301+
}
302+
} else {
303+
warn!(self.apictx.log, "SMF refresh called without a rack id");
304+
}
305+
}
306+
286307
// TODO does MGS register itself with oximeter?
287308
// Register the Nexus server as a metric producer with `oximeter.
288309
// pub async fn register_as_producer(&self) {
@@ -313,8 +334,7 @@ pub async fn start_server(
313334
} else {
314335
debug!(log, "registered DTrace probes");
315336
}
316-
let rack_id = Uuid::new_v4();
317-
let server = Server::start(config, args, rack_id, log).await?;
337+
let server = Server::start(config, args, log).await?;
318338
// server.register_as_producer().await;
319339
Ok(server)
320340
}

package-manifest.toml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,10 @@ service_name = "mgs"
259259
only_for_targets.image = "standard"
260260
only_for_targets.switch = "stub"
261261
source.type = "local"
262-
source.paths = [ { from = "smf/mgs-sim", to = "/var/svc/manifest/site/mgs" } ]
262+
source.paths = [
263+
{ from = "smf/mgs/manifest.xml", to = "/var/svc/manifest/site/mgs/manifest.xml" },
264+
{ from = "smf/mgs-sim/config.toml", to = "/var/svc/manifest/site/mgs/config.toml" }
265+
]
263266
output.intermediate_only = true
264267
output.type = "zone"
265268

@@ -268,7 +271,10 @@ service_name = "mgs"
268271
only_for_targets.image = "standard"
269272
only_for_targets.switch = "softnpu"
270273
source.type = "local"
271-
source.paths = [ { from = "smf/mgs-sim", to = "/var/svc/manifest/site/mgs" } ]
274+
source.paths = [
275+
{ from = "smf/mgs/manifest.xml", to = "/var/svc/manifest/site/mgs/manifest.xml" },
276+
{ from = "smf/mgs-sim/config.toml", to = "/var/svc/manifest/site/mgs/config.toml" }
277+
]
272278
output.intermediate_only = true
273279
output.type = "zone"
274280

sled-agent/src/services.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1645,6 +1645,10 @@ impl ServiceManager {
16451645
}
16461646
}
16471647

1648+
if let Some(info) = self.inner.sled_info.get() {
1649+
smfh.setprop("config/rack_id", info.rack_id)?;
1650+
}
1651+
16481652
smfh.refresh()?;
16491653
}
16501654
ServiceType::SpSim => {
@@ -2754,6 +2758,20 @@ impl ServiceManager {
27542758
&format!("[{address}]:{MGS_PORT}"),
27552759
)?;
27562760

2761+
// It should be impossible for the `sled_info` not to be set here,
2762+
// as the underlay is set at the same time.
2763+
if let Some(info) = self.inner.sled_info.get() {
2764+
smfh.setprop("config/rack_id", info.rack_id)?;
2765+
} else {
2766+
error!(
2767+
self.inner.log,
2768+
concat!(
2769+
"rack_id not present,",
2770+
" even though underlay address exists"
2771+
)
2772+
);
2773+
}
2774+
27572775
smfh.refresh()?;
27582776
}
27592777
ServiceType::Dendrite { .. } => {

smf/mgs-sim/manifest.xml

Lines changed: 0 additions & 57 deletions
This file was deleted.

smf/mgs/manifest.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@
3131
"unknown" with a UUID as a part of starting us.
3232
-->
3333
<propval name='id' type='astring' value='unknown' />
34+
<!--
35+
`config/rack_id` is expected to have a single value; sled-agent will replace
36+
"unknown" with a UUID when it learns about it. This should happen in the same
37+
refresh where the underlay address is provisioned into `config/address`.
38+
-->
39+
<propval name='rack_id' type='astring' value='unknown' />
3440
<!--
3541
`config/address` is allowed to have multiple values, so we do not seed it
3642
with an initial `unknown` that sled-agent would need to delete.

0 commit comments

Comments
 (0)