Skip to content

Commit 2c22a2e

Browse files
author
lif
committed
Add some unit tests for sled-agent Instance creation
At time of writing, instance creation roughly looks like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (*blocking!*) - `RunningZone::install` and `Zones::boot` - `illumos_utils::svc::wait_for_service` - `self::wait_for_http_server` for propolis-server itself - sled-agent: `Instance::ensure_propolis_and_tasks` - sled-agent: spawn `Instance::monitor_state_task` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: return ok result - nexus: `handle_instance_put_result` Or at least, it does in the happy path. omicron#3927 saw propolis zone creation take longer than the minute nexus's call to sled-agent's `instance_put_state`. That might've looked something like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (*blocking!*) - `RunningZone::install` and `Zones::boot` - nexus: i've been waiting a whole minute for this. connection timeout! - nexus: `handle_instance_put_result` - sled-agent: [...] return... oh, they hung up. :( To avoid this timeout being implicit at the *Dropshot configuration* layer (that is to say, we should still have *some* timeout), we could consider a small refactor to make `instance_put_state` not a blocking call -- especially since it's already sending nexus updates on its progress via out-of-band `cpapi_instances_put` calls! That might look something like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: spawn { - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (blocking!) - sled-agent: `Instance::ensure_propolis_and_tasks` - sled-agent: spawn `Instance::monitor_state_task` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent -> nexus: a cpapi call equivalent to the `handle_instance_put_result` nexus currently invokes after getting the response from the blocking call (With a way for nexus to cancel an instance creation by ID, and a timeout in sled-agent itself for terminating the attempt and reporting the failure back to nexus, and a shorter threshold for logging the event of an instance creation taking a long time.) Before such a change, though, we should really have some more tests around sled-agent's instance creation code at all! So here's a few.
1 parent b114324 commit 2c22a2e

File tree

5 files changed

+572
-5
lines changed

5 files changed

+572
-5
lines changed

illumos-utils/src/running_zone.rs

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,29 +1169,55 @@ impl ZoneBuilderFactory {
11691169
/// Created by [ZoneBuilderFactory].
11701170
#[derive(Default)]
11711171
pub struct ZoneBuilder<'a> {
1172+
/// Logger to which status messages are written during zone installation.
11721173
log: Option<Logger>,
1174+
/// Allocates the NIC used for control plane communication.
11731175
underlay_vnic_allocator: Option<&'a VnicAllocator<Etherstub>>,
1176+
/// Filesystem path at which the installed zone will reside.
11741177
zone_root_path: Option<&'a Utf8Path>,
1178+
/// The directories that will be searched for the image tarball for the
1179+
/// provided zone type ([`Self::with_zone_type`]).
11751180
zone_image_paths: Option<&'a [Utf8PathBuf]>,
1181+
/// The name of the type of zone being created (e.g. "propolis-server")
11761182
zone_type: Option<&'a str>,
1177-
unique_name: Option<Uuid>, // actually optional
1183+
/// Unique ID of the instance of the zone being created. (optional)
1184+
// *actually* optional (in contrast to other fields that are `Option` for
1185+
// builder purposes - that is, skipping this field in the builder will
1186+
// still result in an `Ok(InstalledZone)` from `.install()`, rather than
1187+
// an `Err(InstallZoneError::IncompleteBuilder)`.
1188+
unique_name: Option<Uuid>,
1189+
/// ZFS datasets to be accessed from within the zone.
11781190
datasets: Option<&'a [zone::Dataset]>,
1191+
/// Filesystems to mount within the zone.
11791192
filesystems: Option<&'a [zone::Fs]>,
1193+
/// Additional network device names to add to the zone.
11801194
data_links: Option<&'a [String]>,
1195+
/// Device nodes to pass through to the zone.
11811196
devices: Option<&'a [zone::Device]>,
1197+
/// OPTE devices for the guest network interfaces.
11821198
opte_ports: Option<Vec<(Port, PortTicket)>>,
1183-
bootstrap_vnic: Option<Link>, // actually optional
1199+
/// NIC to use for creating a bootstrap address on the switch zone.
1200+
// actually optional (as above)
1201+
bootstrap_vnic: Option<Link>,
1202+
/// Physical NICs possibly provisioned to the zone.
11841203
links: Option<Vec<Link>>,
1204+
/// The maximum set of privileges any process in this zone can obtain.
11851205
limit_priv: Option<Vec<String>>,
1206+
/// For unit tests only: if `Some`, then no actual zones will be installed
1207+
/// by this builder, and minimal facsimiles of them will be placed in
1208+
/// temporary directories according to the contents of the provided
1209+
/// `FakeZoneBuilderConfig`.
11861210
fake_cfg: Option<FakeZoneBuilderConfig>,
11871211
}
11881212

11891213
impl<'a> ZoneBuilder<'a> {
1214+
/// Logger to which status messages are written during zone installation.
11901215
pub fn with_log(mut self, log: Logger) -> Self {
11911216
self.log = Some(log);
11921217
self
11931218
}
11941219

1220+
/// Allocates the NIC used for control plane communication.
11951221
pub fn with_underlay_vnic_allocator(
11961222
mut self,
11971223
vnic_allocator: &'a VnicAllocator<Etherstub>,
@@ -1200,11 +1226,14 @@ impl<'a> ZoneBuilder<'a> {
12001226
self
12011227
}
12021228

1229+
/// Filesystem path at which the installed zone will reside.
12031230
pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self {
12041231
self.zone_root_path = Some(root_path);
12051232
self
12061233
}
12071234

1235+
/// The directories that will be searched for the image tarball for the
1236+
/// provided zone type ([`Self::with_zone_type`]).
12081237
pub fn with_zone_image_paths(
12091238
mut self,
12101239
image_paths: &'a [Utf8PathBuf],
@@ -1213,56 +1242,68 @@ impl<'a> ZoneBuilder<'a> {
12131242
self
12141243
}
12151244

1245+
/// The name of the type of zone being created (e.g. "propolis-server")
12161246
pub fn with_zone_type(mut self, zone_type: &'a str) -> Self {
12171247
self.zone_type = Some(zone_type);
12181248
self
12191249
}
12201250

1251+
/// Unique ID of the instance of the zone being created. (optional)
12211252
pub fn with_unique_name(mut self, uuid: Uuid) -> Self {
12221253
self.unique_name = Some(uuid);
12231254
self
12241255
}
12251256

1257+
/// ZFS datasets to be accessed from within the zone.
12261258
pub fn with_datasets(mut self, datasets: &'a [zone::Dataset]) -> Self {
12271259
self.datasets = Some(datasets);
12281260
self
12291261
}
12301262

1263+
/// Filesystems to mount within the zone.
12311264
pub fn with_filesystems(mut self, filesystems: &'a [zone::Fs]) -> Self {
12321265
self.filesystems = Some(filesystems);
12331266
self
12341267
}
12351268

1269+
/// Additional network device names to add to the zone.
12361270
pub fn with_data_links(mut self, links: &'a [String]) -> Self {
12371271
self.data_links = Some(links);
12381272
self
12391273
}
12401274

1275+
/// Device nodes to pass through to the zone.
12411276
pub fn with_devices(mut self, devices: &'a [zone::Device]) -> Self {
12421277
self.devices = Some(devices);
12431278
self
12441279
}
12451280

1281+
/// OPTE devices for the guest network interfaces.
12461282
pub fn with_opte_ports(mut self, ports: Vec<(Port, PortTicket)>) -> Self {
12471283
self.opte_ports = Some(ports);
12481284
self
12491285
}
12501286

1287+
/// NIC to use for creating a bootstrap address on the switch zone.
1288+
/// (optional)
12511289
pub fn with_bootstrap_vnic(mut self, vnic: Link) -> Self {
12521290
self.bootstrap_vnic = Some(vnic);
12531291
self
12541292
}
12551293

1294+
/// Physical NICs possibly provisioned to the zone.
12561295
pub fn with_links(mut self, links: Vec<Link>) -> Self {
12571296
self.links = Some(links);
12581297
self
12591298
}
12601299

1300+
/// The maximum set of privileges any process in this zone can obtain.
12611301
pub fn with_limit_priv(mut self, limit_priv: Vec<String>) -> Self {
12621302
self.limit_priv = Some(limit_priv);
12631303
self
12641304
}
12651305

1306+
// (used in unit tests)
12661307
fn fake_install(self) -> Result<InstalledZone, InstallZoneError> {
12671308
let zone = self
12681309
.zone_type
@@ -1300,6 +1341,9 @@ impl<'a> ZoneBuilder<'a> {
13001341
.ok_or(InstallZoneError::IncompleteBuilder)
13011342
}
13021343

1344+
/// Create the zone with the provided parameters.
1345+
/// Returns `Err(InstallZoneError::IncompleteBuilder)` if a necessary
1346+
/// parameter was not provided.
13031347
pub async fn install(self) -> Result<InstalledZone, InstallZoneError> {
13041348
if self.fake_cfg.is_some() {
13051349
return self.fake_install();

sled-agent/src/fakes/nexus.rs

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,18 @@
88
//! to operate correctly.
99
1010
use dropshot::{
11-
endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseOk, Path,
12-
RequestContext,
11+
endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseOk,
12+
HttpResponseUpdatedNoContent, Path, RequestContext, TypedBody,
1313
};
1414
use hyper::Body;
1515
use internal_dns::ServiceName;
1616
use omicron_common::api::external::Error;
17-
use omicron_common::api::internal::nexus::UpdateArtifactId;
17+
use omicron_common::api::internal::nexus::{
18+
SledInstanceState, UpdateArtifactId,
19+
};
20+
use schemars::JsonSchema;
21+
use serde::Deserialize;
22+
use uuid::Uuid;
1823

1924
/// Implements a fake Nexus.
2025
///
@@ -28,6 +33,13 @@ pub trait FakeNexusServer: Send + Sync {
2833
) -> Result<Vec<u8>, Error> {
2934
Err(Error::internal_error("Not implemented"))
3035
}
36+
fn cpapi_instances_put(
37+
&self,
38+
_instance_id: Uuid,
39+
_new_runtime_state: SledInstanceState,
40+
) -> Result<(), Error> {
41+
Err(Error::internal_error("Not implemented"))
42+
}
3143
}
3244

3345
/// Describes the server context type.
@@ -52,9 +64,32 @@ async fn cpapi_artifact_download(
5264
))
5365
}
5466

67+
#[derive(Deserialize, JsonSchema)]
68+
struct InstancePathParam {
69+
instance_id: Uuid,
70+
}
71+
72+
#[endpoint {
73+
method = PUT,
74+
path = "/instances/{instance_id}",
75+
}]
76+
async fn cpapi_instances_put(
77+
request_context: RequestContext<ServerContext>,
78+
path_params: Path<InstancePathParam>,
79+
new_runtime_state: TypedBody<SledInstanceState>,
80+
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
81+
let context = request_context.context();
82+
context.cpapi_instances_put(
83+
path_params.into_inner().instance_id,
84+
new_runtime_state.into_inner(),
85+
)?;
86+
Ok(HttpResponseUpdatedNoContent())
87+
}
88+
5589
fn api() -> ApiDescription<ServerContext> {
5690
let mut api = ApiDescription::new();
5791
api.register(cpapi_artifact_download).unwrap();
92+
api.register(cpapi_instances_put).unwrap();
5893
api
5994
}
6095

0 commit comments

Comments
 (0)