Skip to content

Commit 74d4b87

Browse files
author
lif
committed
Add some unit tests for sled-agent Instance creation
At time of writing, instance creation roughly looks like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (*blocking!*) - `RunningZone::install` and `Zones::boot` - `illumos_utils::svc::wait_for_service` - `self::wait_for_http_server` for propolis-server itself - sled-agent: `Instance::ensure_propolis_and_tasks` - sled-agent: spawn `Instance::monitor_state_task` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: return ok result - nexus: `handle_instance_put_result` Or at least, it does in the happy path. omicron#3927 saw propolis zone creation take longer than the minute nexus's call to sled-agent's `instance_put_state`. That might've looked something like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (*blocking!*) - `RunningZone::install` and `Zones::boot` - nexus: i've been waiting a whole minute for this. connection timeout! - nexus: `handle_instance_put_result` - sled-agent: [...] return... oh, they hung up. :( To avoid this timeout being implicit at the *Dropshot configuration* layer (that is to say, we should still have *some* timeout), we could consider a small refactor to make `instance_put_state` not a blocking call -- especially since it's already sending nexus updates on its progress via out-of-band `cpapi_instances_put` calls! That might look something like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: spawn { - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (blocking!) - sled-agent: `Instance::ensure_propolis_and_tasks` - sled-agent: spawn `Instance::monitor_state_task` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent -> nexus: a cpapi call equivalent to the `handle_instance_put_result` nexus currently invokes after getting the response from the blocking call (With a way for nexus to cancel an instance creation by ID, and a timeout in sled-agent itself for terminating the attempt and reporting the failure back to nexus, and a shorter threshold for logging the event of an instance creation taking a long time.) Before such a change, though, we should really have some more tests around sled-agent's instance creation code at all! So here's a few.
1 parent a04f5e3 commit 74d4b87

File tree

5 files changed

+551
-5
lines changed

5 files changed

+551
-5
lines changed

illumos-utils/src/running_zone.rs

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,24 +1174,30 @@ pub struct ZoneBuilder<'a> {
11741174
zone_root_path: Option<&'a Utf8Path>,
11751175
zone_image_paths: Option<&'a [Utf8PathBuf]>,
11761176
zone_type: Option<&'a str>,
1177-
unique_name: Option<Uuid>, // actually optional
1177+
// actually optional - that is, skipping this field in the builder will
1178+
// still result in an `Ok(InstalledZone)` from `.install()`, rather than
1179+
// an `Err(InstallZoneError::IncompleteBuilder)`.
1180+
unique_name: Option<Uuid>,
11781181
datasets: Option<&'a [zone::Dataset]>,
11791182
filesystems: Option<&'a [zone::Fs]>,
11801183
data_links: Option<&'a [String]>,
11811184
devices: Option<&'a [zone::Device]>,
11821185
opte_ports: Option<Vec<(Port, PortTicket)>>,
1183-
bootstrap_vnic: Option<Link>, // actually optional
1186+
// actually optional (as above)
1187+
bootstrap_vnic: Option<Link>,
11841188
links: Option<Vec<Link>>,
11851189
limit_priv: Option<Vec<String>>,
11861190
fake_cfg: Option<FakeZoneBuilderConfig>,
11871191
}
11881192

11891193
impl<'a> ZoneBuilder<'a> {
1194+
/// Logger to which status messages are written during zone installation.
11901195
pub fn with_log(mut self, log: Logger) -> Self {
11911196
self.log = Some(log);
11921197
self
11931198
}
11941199

1200+
/// Allocates the NIC used for control plane communication.
11951201
pub fn with_underlay_vnic_allocator(
11961202
mut self,
11971203
vnic_allocator: &'a VnicAllocator<Etherstub>,
@@ -1200,11 +1206,14 @@ impl<'a> ZoneBuilder<'a> {
12001206
self
12011207
}
12021208

1209+
/// Filesystem path at which the installed zone will reside.
12031210
pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self {
12041211
self.zone_root_path = Some(root_path);
12051212
self
12061213
}
12071214

1215+
/// The directories that will be searched for the image tarball for the
1216+
/// provided zone type ([`Self::with_zone_type`]).
12081217
pub fn with_zone_image_paths(
12091218
mut self,
12101219
image_paths: &'a [Utf8PathBuf],
@@ -1213,56 +1222,67 @@ impl<'a> ZoneBuilder<'a> {
12131222
self
12141223
}
12151224

1225+
/// The name of the type of zone being created (e.g. "propolis-server")
12161226
pub fn with_zone_type(mut self, zone_type: &'a str) -> Self {
12171227
self.zone_type = Some(zone_type);
12181228
self
12191229
}
12201230

1231+
/// Unique ID of the instance of the zone being created.
12211232
pub fn with_unique_name(mut self, uuid: Uuid) -> Self {
12221233
self.unique_name = Some(uuid);
12231234
self
12241235
}
12251236

1237+
/// ZFS datasets to be accessed from within the zone.
12261238
pub fn with_datasets(mut self, datasets: &'a [zone::Dataset]) -> Self {
12271239
self.datasets = Some(datasets);
12281240
self
12291241
}
12301242

1243+
/// Filesystems to mount within the zone.
12311244
pub fn with_filesystems(mut self, filesystems: &'a [zone::Fs]) -> Self {
12321245
self.filesystems = Some(filesystems);
12331246
self
12341247
}
12351248

1249+
/// Additional network device names to add to the zone.
12361250
pub fn with_data_links(mut self, links: &'a [String]) -> Self {
12371251
self.data_links = Some(links);
12381252
self
12391253
}
12401254

1255+
/// Device nodes to pass through to the zone.
12411256
pub fn with_devices(mut self, devices: &'a [zone::Device]) -> Self {
12421257
self.devices = Some(devices);
12431258
self
12441259
}
12451260

1261+
/// OPTE devices for the guest network interfaces.
12461262
pub fn with_opte_ports(mut self, ports: Vec<(Port, PortTicket)>) -> Self {
12471263
self.opte_ports = Some(ports);
12481264
self
12491265
}
12501266

1267+
/// NIC to use for creating a bootstrap address on the switch zone.
12511268
pub fn with_bootstrap_vnic(mut self, vnic: Link) -> Self {
12521269
self.bootstrap_vnic = Some(vnic);
12531270
self
12541271
}
12551272

1273+
/// Physical NICs possibly provisioned to the zone.
12561274
pub fn with_links(mut self, links: Vec<Link>) -> Self {
12571275
self.links = Some(links);
12581276
self
12591277
}
12601278

1279+
/// The maximum set of privileges any process in this zone can obtain.
12611280
pub fn with_limit_priv(mut self, limit_priv: Vec<String>) -> Self {
12621281
self.limit_priv = Some(limit_priv);
12631282
self
12641283
}
12651284

1285+
// (used in unit tests)
12661286
fn fake_install(self) -> Result<InstalledZone, InstallZoneError> {
12671287
let zone = self
12681288
.zone_type
@@ -1300,6 +1320,9 @@ impl<'a> ZoneBuilder<'a> {
13001320
.ok_or(InstallZoneError::IncompleteBuilder)
13011321
}
13021322

1323+
/// Create the zone with the provided parameters.
1324+
/// Returns `Err(InstallZoneError::IncompleteBuilder)` if a necessary
1325+
/// parameter was not provided.
13031326
pub async fn install(self) -> Result<InstalledZone, InstallZoneError> {
13041327
if self.fake_cfg.is_some() {
13051328
return self.fake_install();

sled-agent/src/fakes/nexus.rs

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,18 @@
88
//! to operate correctly.
99
1010
use dropshot::{
11-
endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseOk, Path,
12-
RequestContext,
11+
endpoint, ApiDescription, FreeformBody, HttpError, HttpResponseOk,
12+
HttpResponseUpdatedNoContent, Path, RequestContext, TypedBody,
1313
};
1414
use hyper::Body;
1515
use internal_dns::ServiceName;
1616
use omicron_common::api::external::Error;
17-
use omicron_common::api::internal::nexus::UpdateArtifactId;
17+
use omicron_common::api::internal::nexus::{
18+
SledInstanceState, UpdateArtifactId,
19+
};
20+
use schemars::JsonSchema;
21+
use serde::Deserialize;
22+
use uuid::Uuid;
1823

1924
/// Implements a fake Nexus.
2025
///
@@ -28,6 +33,13 @@ pub trait FakeNexusServer: Send + Sync {
2833
) -> Result<Vec<u8>, Error> {
2934
Err(Error::internal_error("Not implemented"))
3035
}
36+
fn cpapi_instances_put(
37+
&self,
38+
_instance_id: Uuid,
39+
_new_runtime_state: SledInstanceState,
40+
) -> Result<(), Error> {
41+
Err(Error::internal_error("Not implemented"))
42+
}
3143
}
3244

3345
/// Describes the server context type.
@@ -52,9 +64,32 @@ async fn cpapi_artifact_download(
5264
))
5365
}
5466

67+
#[derive(Deserialize, JsonSchema)]
68+
struct InstancePathParam {
69+
instance_id: Uuid,
70+
}
71+
72+
#[endpoint {
73+
method = PUT,
74+
path = "/instances/{instance_id}",
75+
}]
76+
async fn cpapi_instances_put(
77+
request_context: RequestContext<ServerContext>,
78+
path_params: Path<InstancePathParam>,
79+
new_runtime_state: TypedBody<SledInstanceState>,
80+
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
81+
let context = request_context.context();
82+
context.cpapi_instances_put(
83+
path_params.into_inner().instance_id,
84+
new_runtime_state.into_inner(),
85+
)?;
86+
Ok(HttpResponseUpdatedNoContent())
87+
}
88+
5589
fn api() -> ApiDescription<ServerContext> {
5690
let mut api = ApiDescription::new();
5791
api.register(cpapi_artifact_download).unwrap();
92+
api.register(cpapi_instances_put).unwrap();
5893
api
5994
}
6095

0 commit comments

Comments
 (0)