Skip to content

sled-agent: don't block during instance creation request from nexus #4691

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 56 additions & 14 deletions end-to-end-tests/src/instance_launch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
use oxide_client::types::{
ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate,
InstanceCpuCount, InstanceCreate, InstanceDiskAttachment,
InstanceNetworkInterfaceAttachment, SshKeyCreate,
InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate,
};
use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt};
use russh::{ChannelMsg, Disconnect};
use russh_keys::key::{KeyPair, PublicKey};
use russh_keys::PublicKeyBase64;
use std::sync::Arc;
use std::time::Duration;
use tokio::time::sleep;

#[tokio::test]
async fn instance_launch() -> Result<()> {
Expand Down Expand Up @@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> {
type Error =
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

let instance_state = ctx
.client
.instance_view()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.send()
.await?
.run_state;

if instance_state == InstanceState::Starting {
return Err(Error::NotYet);
}

let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
Expand Down Expand Up @@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> {

// check that we saw it on the console
eprintln!("waiting for serial console");
sleep(Duration::from_secs(5)).await;
let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.most_recent(1024 * 1024)
.max_bytes(1024 * 1024)
.send()
.await?
.data,

let data = wait_for_condition(
|| async {
type Error =
CondCheckError<oxide_client::Error<oxide_client::types::Error>>;

let instance_state = ctx
.client
.instance_view()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.send()
.await?
.run_state;

if instance_state == InstanceState::Starting {
return Err(Error::NotYet);
}

let data = String::from_utf8_lossy(
&ctx.client
.instance_serial_console()
.project(ctx.project_name.clone())
.instance(instance.name.clone())
.most_recent(1024 * 1024)
.max_bytes(1024 * 1024)
.send()
.await
.map_err(|_e| Error::NotYet)?
.data,
)
.into_owned();
if data.contains("-----END SSH HOST KEY KEYS-----") {
Ok(data)
} else {
Err(Error::NotYet)
}
},
&Duration::from_secs(5),
&Duration::from_secs(300),
)
.into_owned();
.await?;

ensure!(
data.contains("Hello, Oxide!"),
"string not seen on console\n{}",
Expand Down
48 changes: 46 additions & 2 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1193,29 +1193,55 @@ impl ZoneBuilderFactory {
/// Created by [ZoneBuilderFactory].
#[derive(Default)]
pub struct ZoneBuilder<'a> {
/// Logger to which status messages are written during zone installation.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for all the additional info!

log: Option<Logger>,
/// Allocates the NIC used for control plane communication.
underlay_vnic_allocator: Option<&'a VnicAllocator<Etherstub>>,
/// Filesystem path at which the installed zone will reside.
zone_root_path: Option<&'a Utf8Path>,
/// The directories that will be searched for the image tarball for the
/// provided zone type ([`Self::with_zone_type`]).
zone_image_paths: Option<&'a [Utf8PathBuf]>,
/// The name of the type of zone being created (e.g. "propolis-server")
zone_type: Option<&'a str>,
unique_name: Option<Uuid>, // actually optional
/// Unique ID of the instance of the zone being created. (optional)
// *actually* optional (in contrast to other fields that are `Option` for
// builder purposes - that is, skipping this field in the builder will
// still result in an `Ok(InstalledZone)` from `.install()`, rather than
// an `Err(InstallZoneError::IncompleteBuilder)`.
unique_name: Option<Uuid>,
/// ZFS datasets to be accessed from within the zone.
datasets: Option<&'a [zone::Dataset]>,
/// Filesystems to mount within the zone.
filesystems: Option<&'a [zone::Fs]>,
/// Additional network device names to add to the zone.
data_links: Option<&'a [String]>,
/// Device nodes to pass through to the zone.
devices: Option<&'a [zone::Device]>,
/// OPTE devices for the guest network interfaces.
opte_ports: Option<Vec<(Port, PortTicket)>>,
bootstrap_vnic: Option<Link>, // actually optional
/// NIC to use for creating a bootstrap address on the switch zone.
// actually optional (as above)
bootstrap_vnic: Option<Link>,
/// Physical NICs possibly provisioned to the zone.
links: Option<Vec<Link>>,
/// The maximum set of privileges any process in this zone can obtain.
limit_priv: Option<Vec<String>>,
/// For unit tests only: if `Some`, then no actual zones will be installed
/// by this builder, and minimal facsimiles of them will be placed in
/// temporary directories according to the contents of the provided
/// `FakeZoneBuilderConfig`.
fake_cfg: Option<FakeZoneBuilderConfig>,
}

impl<'a> ZoneBuilder<'a> {
/// Logger to which status messages are written during zone installation.
pub fn with_log(mut self, log: Logger) -> Self {
self.log = Some(log);
self
}

/// Allocates the NIC used for control plane communication.
pub fn with_underlay_vnic_allocator(
mut self,
vnic_allocator: &'a VnicAllocator<Etherstub>,
Expand All @@ -1224,11 +1250,14 @@ impl<'a> ZoneBuilder<'a> {
self
}

/// Filesystem path at which the installed zone will reside.
pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self {
self.zone_root_path = Some(root_path);
self
}

/// The directories that will be searched for the image tarball for the
/// provided zone type ([`Self::with_zone_type`]).
pub fn with_zone_image_paths(
mut self,
image_paths: &'a [Utf8PathBuf],
Expand All @@ -1237,56 +1266,68 @@ impl<'a> ZoneBuilder<'a> {
self
}

/// The name of the type of zone being created (e.g. "propolis-server")
pub fn with_zone_type(mut self, zone_type: &'a str) -> Self {
self.zone_type = Some(zone_type);
self
}

/// Unique ID of the instance of the zone being created. (optional)
pub fn with_unique_name(mut self, uuid: Uuid) -> Self {
self.unique_name = Some(uuid);
self
}

/// ZFS datasets to be accessed from within the zone.
pub fn with_datasets(mut self, datasets: &'a [zone::Dataset]) -> Self {
self.datasets = Some(datasets);
self
}

/// Filesystems to mount within the zone.
pub fn with_filesystems(mut self, filesystems: &'a [zone::Fs]) -> Self {
self.filesystems = Some(filesystems);
self
}

/// Additional network device names to add to the zone.
pub fn with_data_links(mut self, links: &'a [String]) -> Self {
self.data_links = Some(links);
self
}

/// Device nodes to pass through to the zone.
pub fn with_devices(mut self, devices: &'a [zone::Device]) -> Self {
self.devices = Some(devices);
self
}

/// OPTE devices for the guest network interfaces.
pub fn with_opte_ports(mut self, ports: Vec<(Port, PortTicket)>) -> Self {
self.opte_ports = Some(ports);
self
}

/// NIC to use for creating a bootstrap address on the switch zone.
/// (optional)
pub fn with_bootstrap_vnic(mut self, vnic: Link) -> Self {
self.bootstrap_vnic = Some(vnic);
self
}

/// Physical NICs possibly provisioned to the zone.
pub fn with_links(mut self, links: Vec<Link>) -> Self {
self.links = Some(links);
self
}

/// The maximum set of privileges any process in this zone can obtain.
pub fn with_limit_priv(mut self, limit_priv: Vec<String>) -> Self {
self.limit_priv = Some(limit_priv);
self
}

// (used in unit tests)
fn fake_install(self) -> Result<InstalledZone, InstallZoneError> {
let zone = self
.zone_type
Expand Down Expand Up @@ -1324,6 +1365,9 @@ impl<'a> ZoneBuilder<'a> {
.ok_or(InstallZoneError::IncompleteBuilder)
}

/// Create the zone with the provided parameters.
/// Returns `Err(InstallZoneError::IncompleteBuilder)` if a necessary
/// parameter was not provided.
pub async fn install(self) -> Result<InstalledZone, InstallZoneError> {
if self.fake_cfg.is_some() {
return self.fake_install();
Expand Down
3 changes: 3 additions & 0 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,9 @@ impl super::Nexus {
//
// If the operation failed, kick the sled agent error back up to
// the caller to let it decide how to handle it.
//
// When creating the zone for the first time, we just get
// Ok(None) here, which is a no-op in write_returned_instance_state.
match instance_put_result {
Ok(state) => self
.write_returned_instance_state(&instance_id, state)
Expand Down
21 changes: 18 additions & 3 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ use sled_agent_client::TestInterfaces as _;
use std::convert::TryFrom;
use std::net::Ipv4Addr;
use std::sync::Arc;
use std::time::Duration;
use uuid::Uuid;

use dropshot::test_util::ClientTestContext;
Expand All @@ -80,6 +81,8 @@ use nexus_test_utils::resource_helpers::{
use nexus_test_utils_macros::nexus_test;
use nexus_types::external_api::shared::SiloRole;
use omicron_sled_agent::sim;
use omicron_test_utils::dev::poll;
use omicron_test_utils::dev::poll::CondCheckError;

type ControlPlaneTestContext =
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
Expand Down Expand Up @@ -3794,10 +3797,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) {

// Create an instance and poke it to ensure it's running.
let instance = create_instance(client, PROJECT_NAME, instance_name).await;
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
let instance_next = poll::wait_for_condition(
|| async {
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
if instance_next.runtime.run_state == InstanceState::Running {
Ok(instance_next)
} else {
Err(CondCheckError::<()>::NotYet)
}
},
&Duration::from_secs(5),
&Duration::from_secs(60),
)
.await
.unwrap();
identity_eq(&instance.identity, &instance_next.identity);
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
assert!(
instance_next.runtime.time_run_state_updated
> instance.runtime.time_run_state_updated
Expand Down
34 changes: 33 additions & 1 deletion sled-agent/src/fakes/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ use hyper::Body;
use internal_dns::ServiceName;
use nexus_client::types::SledAgentInfo;
use omicron_common::api::external::Error;
use omicron_common::api::internal::nexus::UpdateArtifactId;
use omicron_common::api::internal::nexus::{
SledInstanceState, UpdateArtifactId,
};
use schemars::JsonSchema;
use serde::Deserialize;
use uuid::Uuid;
Expand Down Expand Up @@ -44,6 +46,14 @@ pub trait FakeNexusServer: Send + Sync {
) -> Result<(), Error> {
Err(Error::internal_error("Not implemented"))
}

fn cpapi_instances_put(
&self,
_instance_id: Uuid,
_new_runtime_state: SledInstanceState,
) -> Result<(), Error> {
Err(Error::internal_error("Not implemented"))
}
}

/// Describes the server context type.
Expand Down Expand Up @@ -107,11 +117,33 @@ async fn sled_agent_put(
Ok(HttpResponseUpdatedNoContent())
}

#[derive(Deserialize, JsonSchema)]
struct InstancePathParam {
instance_id: Uuid,
}
#[endpoint {
method = PUT,
path = "/instances/{instance_id}",
}]
async fn cpapi_instances_put(
request_context: RequestContext<ServerContext>,
path_params: Path<InstancePathParam>,
new_runtime_state: TypedBody<SledInstanceState>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
let context = request_context.context();
context.cpapi_instances_put(
path_params.into_inner().instance_id,
new_runtime_state.into_inner(),
)?;
Ok(HttpResponseUpdatedNoContent())
}

fn api() -> ApiDescription<ServerContext> {
let mut api = ApiDescription::new();
api.register(cpapi_artifact_download).unwrap();
api.register(sled_agent_get).unwrap();
api.register(sled_agent_put).unwrap();
api.register(cpapi_instances_put).unwrap();
api
}

Expand Down
Loading