diff --git a/end-to-end-tests/src/instance_launch.rs b/end-to-end-tests/src/instance_launch.rs index 1aae46fe984..377fef4c0bc 100644 --- a/end-to-end-tests/src/instance_launch.rs +++ b/end-to-end-tests/src/instance_launch.rs @@ -7,7 +7,7 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oxide_client::types::{ ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate, InstanceCpuCount, InstanceCreate, InstanceDiskAttachment, - InstanceNetworkInterfaceAttachment, SshKeyCreate, + InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate, }; use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt}; use russh::{ChannelMsg, Disconnect}; @@ -15,7 +15,6 @@ use russh_keys::key::{KeyPair, PublicKey}; use russh_keys::PublicKeyBase64; use std::sync::Arc; use std::time::Duration; -use tokio::time::sleep; #[tokio::test] async fn instance_launch() -> Result<()> { @@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> { type Error = CondCheckError>; + let instance_state = ctx + .client + .instance_view() + .project(ctx.project_name.clone()) + .instance(instance.name.clone()) + .send() + .await? + .run_state; + + if instance_state == InstanceState::Starting { + return Err(Error::NotYet); + } + let data = String::from_utf8_lossy( &ctx.client .instance_serial_console() @@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> { // check that we saw it on the console eprintln!("waiting for serial console"); - sleep(Duration::from_secs(5)).await; - let data = String::from_utf8_lossy( - &ctx.client - .instance_serial_console() - .project(ctx.project_name.clone()) - .instance(instance.name.clone()) - .most_recent(1024 * 1024) - .max_bytes(1024 * 1024) - .send() - .await? - .data, + + let data = wait_for_condition( + || async { + type Error = + CondCheckError>; + + let instance_state = ctx + .client + .instance_view() + .project(ctx.project_name.clone()) + .instance(instance.name.clone()) + .send() + .await? + .run_state; + + if instance_state == InstanceState::Starting { + return Err(Error::NotYet); + } + + let data = String::from_utf8_lossy( + &ctx.client + .instance_serial_console() + .project(ctx.project_name.clone()) + .instance(instance.name.clone()) + .most_recent(1024 * 1024) + .max_bytes(1024 * 1024) + .send() + .await + .map_err(|_e| Error::NotYet)? + .data, + ) + .into_owned(); + if data.contains("-----END SSH HOST KEY KEYS-----") { + Ok(data) + } else { + Err(Error::NotYet) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(300), ) - .into_owned(); + .await?; + ensure!( data.contains("Hello, Oxide!"), "string not seen on console\n{}", diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 0dd8f85e4ec..c529a1b6d45 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -1193,29 +1193,55 @@ impl ZoneBuilderFactory { /// Created by [ZoneBuilderFactory]. #[derive(Default)] pub struct ZoneBuilder<'a> { + /// Logger to which status messages are written during zone installation. log: Option, + /// Allocates the NIC used for control plane communication. underlay_vnic_allocator: Option<&'a VnicAllocator>, + /// Filesystem path at which the installed zone will reside. zone_root_path: Option<&'a Utf8Path>, + /// The directories that will be searched for the image tarball for the + /// provided zone type ([`Self::with_zone_type`]). zone_image_paths: Option<&'a [Utf8PathBuf]>, + /// The name of the type of zone being created (e.g. "propolis-server") zone_type: Option<&'a str>, - unique_name: Option, // actually optional + /// Unique ID of the instance of the zone being created. (optional) + // *actually* optional (in contrast to other fields that are `Option` for + // builder purposes - that is, skipping this field in the builder will + // still result in an `Ok(InstalledZone)` from `.install()`, rather than + // an `Err(InstallZoneError::IncompleteBuilder)`. + unique_name: Option, + /// ZFS datasets to be accessed from within the zone. datasets: Option<&'a [zone::Dataset]>, + /// Filesystems to mount within the zone. filesystems: Option<&'a [zone::Fs]>, + /// Additional network device names to add to the zone. data_links: Option<&'a [String]>, + /// Device nodes to pass through to the zone. devices: Option<&'a [zone::Device]>, + /// OPTE devices for the guest network interfaces. opte_ports: Option>, - bootstrap_vnic: Option, // actually optional + /// NIC to use for creating a bootstrap address on the switch zone. + // actually optional (as above) + bootstrap_vnic: Option, + /// Physical NICs possibly provisioned to the zone. links: Option>, + /// The maximum set of privileges any process in this zone can obtain. limit_priv: Option>, + /// For unit tests only: if `Some`, then no actual zones will be installed + /// by this builder, and minimal facsimiles of them will be placed in + /// temporary directories according to the contents of the provided + /// `FakeZoneBuilderConfig`. fake_cfg: Option, } impl<'a> ZoneBuilder<'a> { + /// Logger to which status messages are written during zone installation. pub fn with_log(mut self, log: Logger) -> Self { self.log = Some(log); self } + /// Allocates the NIC used for control plane communication. pub fn with_underlay_vnic_allocator( mut self, vnic_allocator: &'a VnicAllocator, @@ -1224,11 +1250,14 @@ impl<'a> ZoneBuilder<'a> { self } + /// Filesystem path at which the installed zone will reside. pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self { self.zone_root_path = Some(root_path); self } + /// The directories that will be searched for the image tarball for the + /// provided zone type ([`Self::with_zone_type`]). pub fn with_zone_image_paths( mut self, image_paths: &'a [Utf8PathBuf], @@ -1237,56 +1266,68 @@ impl<'a> ZoneBuilder<'a> { self } + /// The name of the type of zone being created (e.g. "propolis-server") pub fn with_zone_type(mut self, zone_type: &'a str) -> Self { self.zone_type = Some(zone_type); self } + /// Unique ID of the instance of the zone being created. (optional) pub fn with_unique_name(mut self, uuid: Uuid) -> Self { self.unique_name = Some(uuid); self } + /// ZFS datasets to be accessed from within the zone. pub fn with_datasets(mut self, datasets: &'a [zone::Dataset]) -> Self { self.datasets = Some(datasets); self } + /// Filesystems to mount within the zone. pub fn with_filesystems(mut self, filesystems: &'a [zone::Fs]) -> Self { self.filesystems = Some(filesystems); self } + /// Additional network device names to add to the zone. pub fn with_data_links(mut self, links: &'a [String]) -> Self { self.data_links = Some(links); self } + /// Device nodes to pass through to the zone. pub fn with_devices(mut self, devices: &'a [zone::Device]) -> Self { self.devices = Some(devices); self } + /// OPTE devices for the guest network interfaces. pub fn with_opte_ports(mut self, ports: Vec<(Port, PortTicket)>) -> Self { self.opte_ports = Some(ports); self } + /// NIC to use for creating a bootstrap address on the switch zone. + /// (optional) pub fn with_bootstrap_vnic(mut self, vnic: Link) -> Self { self.bootstrap_vnic = Some(vnic); self } + /// Physical NICs possibly provisioned to the zone. pub fn with_links(mut self, links: Vec) -> Self { self.links = Some(links); self } + /// The maximum set of privileges any process in this zone can obtain. pub fn with_limit_priv(mut self, limit_priv: Vec) -> Self { self.limit_priv = Some(limit_priv); self } + // (used in unit tests) fn fake_install(self) -> Result { let zone = self .zone_type @@ -1324,6 +1365,9 @@ impl<'a> ZoneBuilder<'a> { .ok_or(InstallZoneError::IncompleteBuilder) } + /// Create the zone with the provided parameters. + /// Returns `Err(InstallZoneError::IncompleteBuilder)` if a necessary + /// parameter was not provided. pub async fn install(self) -> Result { if self.fake_cfg.is_some() { return self.fake_install(); diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 2d09078e189..2300bd56f26 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -986,6 +986,9 @@ impl super::Nexus { // // If the operation failed, kick the sled agent error back up to // the caller to let it decide how to handle it. + // + // When creating the zone for the first time, we just get + // Ok(None) here, which is a no-op in write_returned_instance_state. match instance_put_result { Ok(state) => self .write_returned_instance_state(&instance_id, state) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 309036256f9..03d5a8c8a99 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -67,6 +67,7 @@ use sled_agent_client::TestInterfaces as _; use std::convert::TryFrom; use std::net::Ipv4Addr; use std::sync::Arc; +use std::time::Duration; use uuid::Uuid; use dropshot::test_util::ClientTestContext; @@ -80,6 +81,8 @@ use nexus_test_utils::resource_helpers::{ use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::shared::SiloRole; use omicron_sled_agent::sim; +use omicron_test_utils::dev::poll; +use omicron_test_utils::dev::poll::CondCheckError; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -3794,10 +3797,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) { // Create an instance and poke it to ensure it's running. let instance = create_instance(client, PROJECT_NAME, instance_name).await; - instance_simulate(nexus, &instance.identity.id).await; - let instance_next = instance_get(&client, &instance_url).await; + let instance_next = poll::wait_for_condition( + || async { + instance_simulate(nexus, &instance.identity.id).await; + let instance_next = instance_get(&client, &instance_url).await; + if instance_next.runtime.run_state == InstanceState::Running { + Ok(instance_next) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(60), + ) + .await + .unwrap(); identity_eq(&instance.identity, &instance_next.identity); - assert_eq!(instance_next.runtime.run_state, InstanceState::Running); assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs index 5920d40f6ca..de37b77bcd5 100644 --- a/sled-agent/src/fakes/nexus.rs +++ b/sled-agent/src/fakes/nexus.rs @@ -15,7 +15,9 @@ use hyper::Body; use internal_dns::ServiceName; use nexus_client::types::SledAgentInfo; use omicron_common::api::external::Error; -use omicron_common::api::internal::nexus::UpdateArtifactId; +use omicron_common::api::internal::nexus::{ + SledInstanceState, UpdateArtifactId, +}; use schemars::JsonSchema; use serde::Deserialize; use uuid::Uuid; @@ -44,6 +46,14 @@ pub trait FakeNexusServer: Send + Sync { ) -> Result<(), Error> { Err(Error::internal_error("Not implemented")) } + + fn cpapi_instances_put( + &self, + _instance_id: Uuid, + _new_runtime_state: SledInstanceState, + ) -> Result<(), Error> { + Err(Error::internal_error("Not implemented")) + } } /// Describes the server context type. @@ -107,11 +117,33 @@ async fn sled_agent_put( Ok(HttpResponseUpdatedNoContent()) } +#[derive(Deserialize, JsonSchema)] +struct InstancePathParam { + instance_id: Uuid, +} +#[endpoint { + method = PUT, + path = "/instances/{instance_id}", +}] +async fn cpapi_instances_put( + request_context: RequestContext, + path_params: Path, + new_runtime_state: TypedBody, +) -> Result { + let context = request_context.context(); + context.cpapi_instances_put( + path_params.into_inner().instance_id, + new_runtime_state.into_inner(), + )?; + Ok(HttpResponseUpdatedNoContent()) +} + fn api() -> ApiDescription { let mut api = ApiDescription::new(); api.register(cpapi_artifact_download).unwrap(); api.register(sled_agent_get).unwrap(); api.register(sled_agent_put).unwrap(); + api.register(cpapi_instances_put).unwrap(); api } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 5ddca904036..8b6699c8ee1 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -30,7 +30,6 @@ use illumos_utils::link::VnicAllocator; use illumos_utils::opte::{DhcpCfg, PortManager}; use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::svc::wait_for_service; -use illumos_utils::zone::Zones; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_common::api::internal::nexus::{ @@ -52,6 +51,11 @@ use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; use uuid::Uuid; +#[cfg(test)] +use illumos_utils::zone::MockZones as Zones; +#[cfg(not(test))] +use illumos_utils::zone::Zones; + // The depth of the request queue for the instance. const QUEUE_SIZE: usize = 32; @@ -1521,3 +1525,683 @@ impl InstanceRunner { out } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::fakes::nexus::{FakeNexusServer, ServerContext}; + use crate::nexus::NexusClient; + use crate::vmm_reservoir::VmmReservoirManagerHandle; + use crate::zone_bundle::CleanupContext; + use camino_tempfile::Utf8TempDir; + use dns_server::dns_server::ServerHandle as DnsServerHandle; + use dropshot::test_util::LogContext; + use dropshot::{HandlerTaskMode, HttpServer}; + use illumos_utils::dladm::MockDladm; + use illumos_utils::dladm::__mock_MockDladm::__create_vnic::Context as MockDladmCreateVnicContext; + use illumos_utils::dladm::__mock_MockDladm::__delete_vnic::Context as MockDladmDeleteVnicContext; + use illumos_utils::opte::params::DhcpConfig; + use illumos_utils::svc::__wait_for_service::Context as MockWaitForServiceContext; + use illumos_utils::zone::MockZones; + use illumos_utils::zone::__mock_MockZones::__boot::Context as MockZonesBootContext; + use illumos_utils::zone::__mock_MockZones::__id::Context as MockZonesIdContext; + use illumos_utils::zpool::ZpoolName; + use internal_dns::resolver::Resolver; + use internal_dns::ServiceName; + use omicron_common::api::external::{ + ByteCount, Generation, Hostname, InstanceCpuCount, InstanceState, + }; + use omicron_common::api::internal::nexus::InstanceProperties; + use sled_storage::disk::{RawDisk, SyntheticDisk}; + use sled_storage::manager::FakeStorageManager; + use std::net::Ipv6Addr; + use std::str::FromStr; + use tokio::sync::watch::Receiver; + use tokio::time::timeout; + + const TIMEOUT_DURATION: tokio::time::Duration = + tokio::time::Duration::from_secs(30); + + #[derive(Default, Clone)] + enum ReceivedInstanceState { + #[default] + None, + InstancePut(SledInstanceState), + } + + struct NexusServer { + observed_runtime_state: + tokio::sync::watch::Sender, + } + impl FakeNexusServer for NexusServer { + fn cpapi_instances_put( + &self, + _instance_id: Uuid, + new_runtime_state: SledInstanceState, + ) -> Result<(), omicron_common::api::external::Error> { + self.observed_runtime_state + .send(ReceivedInstanceState::InstancePut(new_runtime_state)) + .map_err(|_| { + omicron_common::api::external::Error::internal_error( + "couldn't send SledInstanceState to test driver", + ) + }) + } + } + + struct FakeNexusParts { + nexus_client: NexusClient, + nexus_server: HttpServer, + state_rx: Receiver, + } + + impl FakeNexusParts { + fn new(logctx: &LogContext) -> Self { + let (state_tx, state_rx) = + tokio::sync::watch::channel(ReceivedInstanceState::None); + + let nexus_server = crate::fakes::nexus::start_test_server( + logctx.log.new(o!("component" => "FakeNexusServer")), + Box::new(NexusServer { observed_runtime_state: state_tx }), + ); + let nexus_client = NexusClient::new( + &format!("http://{}", nexus_server.local_addr()), + logctx.log.new(o!("component" => "NexusClient")), + ); + + Self { nexus_client, nexus_server, state_rx } + } + } + + fn mock_vnic_contexts( + ) -> (MockDladmCreateVnicContext, MockDladmDeleteVnicContext) { + let create_vnic_ctx = MockDladm::create_vnic_context(); + let delete_vnic_ctx = MockDladm::delete_vnic_context(); + create_vnic_ctx.expect().return_once( + |physical_link: &Etherstub, _, _, _, _| { + assert_eq!(&physical_link.0, "mystub"); + Ok(()) + }, + ); + delete_vnic_ctx.expect().returning(|_| Ok(())); + (create_vnic_ctx, delete_vnic_ctx) + } + + // InstanceManager::ensure_state calls Instance::put_state(Running), + // which calls Instance::propolis_ensure, + // which spawns Instance::monitor_state_task, + // which calls cpapi_instances_put + // and calls Instance::setup_propolis_inner, + // which creates the zone (which isn't real in these tests, of course) + fn mock_zone_contexts( + ) -> (MockZonesBootContext, MockWaitForServiceContext, MockZonesIdContext) + { + let boot_ctx = MockZones::boot_context(); + boot_ctx.expect().return_once(|_| Ok(())); + let wait_ctx = illumos_utils::svc::wait_for_service_context(); + wait_ctx.expect().times(..).returning(|_, _, _| Ok(())); + let zone_id_ctx = MockZones::id_context(); + zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); + (boot_ctx, wait_ctx, zone_id_ctx) + } + + async fn dns_server( + logctx: &LogContext, + nexus_server: &HttpServer, + ) -> (DnsServerHandle, Arc, Utf8TempDir) { + let storage_path = + Utf8TempDir::new().expect("Failed to create temporary directory"); + let config_store = dns_server::storage::Config { + keep_old_generations: 3, + storage_path: storage_path.path().to_owned(), + }; + + let (dns_server, dns_dropshot) = dns_server::start_servers( + logctx.log.new(o!("component" => "DnsServer")), + dns_server::storage::Store::new( + logctx.log.new(o!("component" => "DnsStore")), + &config_store, + ) + .unwrap(), + &dns_server::dns_server::Config { + bind_address: "[::1]:0".parse().unwrap(), + }, + &dropshot::ConfigDropshot { + bind_address: "[::1]:0".parse().unwrap(), + request_body_max_bytes: 8 * 1024, + default_handler_task_mode: HandlerTaskMode::Detached, + }, + ) + .await + .expect("starting DNS server"); + + let dns_dropshot_client = dns_service_client::Client::new( + &format!("http://{}", dns_dropshot.local_addr()), + logctx.log.new(o!("component" => "DnsDropshotClient")), + ); + let mut dns_config = internal_dns::DnsConfigBuilder::new(); + let IpAddr::V6(nexus_ip_addr) = nexus_server.local_addr().ip() else { + panic!("IPv6 address required for nexus_server") + }; + let zone = dns_config.host_zone(Uuid::new_v4(), nexus_ip_addr).unwrap(); + dns_config + .service_backend_zone( + ServiceName::Nexus, + &zone, + nexus_server.local_addr().port(), + ) + .unwrap(); + let dns_config = dns_config.build(); + dns_dropshot_client.dns_config_put(&dns_config).await.unwrap(); + + let resolver = Arc::new( + Resolver::new_from_addrs( + logctx.log.new(o!("component" => "Resolver")), + &[dns_server.local_address()], + ) + .unwrap(), + ); + (dns_server, resolver, storage_path) + } + + // note the "mock" here is different from the vnic/zone contexts above. + // this is actually running code for a dropshot server from propolis. + // (might we want a locally-defined fake whose behavior we can control + // more directly from the test driver?) + // TODO: factor out, this is also in sled-agent-sim. + fn propolis_mock_server( + log: &Logger, + ) -> (HttpServer>, PropolisClient) { + let propolis_bind_address = + SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); // allocate port + let dropshot_config = dropshot::ConfigDropshot { + bind_address: propolis_bind_address, + ..Default::default() + }; + let propolis_log = log.new(o!("component" => "propolis-server-mock")); + let private = + Arc::new(propolis_mock_server::Context::new(propolis_log)); + info!(log, "Starting mock propolis-server..."); + let dropshot_log = log.new(o!("component" => "dropshot")); + let mock_api = propolis_mock_server::api(); + + let srv = dropshot::HttpServerStarter::new( + &dropshot_config, + mock_api, + private, + &dropshot_log, + ) + .expect("couldn't create mock propolis-server") + .start(); + + let client = propolis_client::Client::new(&format!( + "http://{}", + srv.local_addr() + )); + + (srv, client) + } + + // make a FakeStorageManager with a "U2" upserted + async fn fake_storage_manager_with_u2() -> StorageHandle { + let (storage_manager, storage_handle) = FakeStorageManager::new(); + tokio::spawn(storage_manager.run()); + let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let external_disk: RawDisk = + SyntheticDisk::new(external_zpool_name, 0).into(); + storage_handle.upsert_disk(external_disk).await; + storage_handle + } + + async fn instance_struct( + logctx: &LogContext, + propolis_addr: SocketAddr, + nexus_client_with_resolver: NexusClientWithResolver, + storage_handle: StorageHandle, + temp_dir: &String, + ) -> Instance { + let id = Uuid::new_v4(); + let propolis_id = Uuid::new_v4(); + + let ticket = InstanceTicket::new_without_manager_for_test(id); + + let initial_state = + fake_instance_initial_state(propolis_id, propolis_addr); + + let services = fake_instance_manager_services( + logctx, + storage_handle, + nexus_client_with_resolver, + temp_dir, + ); + + let metadata = InstanceMetadata { + silo_id: Uuid::new_v4(), + project_id: Uuid::new_v4(), + }; + + Instance::new( + logctx.log.new(o!("component" => "Instance")), + id, + propolis_id, + ticket, + initial_state, + services, + metadata, + ) + .unwrap() + } + + fn fake_instance_initial_state( + propolis_id: Uuid, + propolis_addr: SocketAddr, + ) -> InstanceInitialState { + let hardware = InstanceHardware { + properties: InstanceProperties { + ncpus: InstanceCpuCount(1), + memory: ByteCount::from_gibibytes_u32(1), + hostname: Hostname::from_str("bert").unwrap(), + }, + nics: vec![], + source_nat: SourceNatConfig { + ip: IpAddr::V6(Ipv6Addr::UNSPECIFIED), + first_port: 0, + last_port: 0, + }, + ephemeral_ip: None, + floating_ips: vec![], + firewall_rules: vec![], + dhcp_config: DhcpConfig { + dns_servers: vec![], + host_domain: None, + search_domains: vec![], + }, + disks: vec![], + cloud_init_bytes: None, + }; + + InstanceInitialState { + hardware, + instance_runtime: InstanceRuntimeState { + propolis_id: Some(propolis_id), + dst_propolis_id: None, + migration_id: None, + gen: Generation::new(), + time_updated: Default::default(), + }, + vmm_runtime: VmmRuntimeState { + state: InstanceState::Creating, + gen: Generation::new(), + time_updated: Default::default(), + }, + propolis_addr, + } + } + + fn fake_instance_manager_services( + logctx: &LogContext, + storage_handle: StorageHandle, + nexus_client_with_resolver: NexusClientWithResolver, + temp_dir: &String, + ) -> InstanceManagerServices { + let vnic_allocator = + VnicAllocator::new("Foo", Etherstub("mystub".to_string())); + let port_manager = PortManager::new( + logctx.log.new(o!("component" => "PortManager")), + Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), + ); + + let cleanup_context = CleanupContext::default(); + let zone_bundler = ZoneBundler::new( + logctx.log.new(o!("component" => "ZoneBundler")), + storage_handle.clone(), + cleanup_context, + ); + + InstanceManagerServices { + nexus_client: nexus_client_with_resolver, + vnic_allocator, + port_manager, + storage: storage_handle, + zone_bundler, + zone_builder_factory: ZoneBuilderFactory::fake(Some(temp_dir)), + } + } + + #[tokio::test] + async fn test_instance_create_events_normal() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_create_events_normal", + ); + + let (propolis_server, _propolis_client) = + propolis_mock_server(&logctx.log); + let propolis_addr = propolis_server.local_addr(); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + let _mock_zone_contexts = mock_zone_contexts(); + + let FakeNexusParts { nexus_client, nexus_server, mut state_rx } = + FakeNexusParts::new(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let storage_handle = fake_storage_manager_with_u2().await; + + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + + let inst = timeout( + TIMEOUT_DURATION, + instance_struct( + &logctx, + propolis_addr, + nexus_client_with_resolver, + storage_handle, + &temp_dir, + ), + ) + .await + .expect("timed out creating Instance struct"); + + let (put_tx, put_rx) = oneshot::channel(); + + // pretending we're InstanceManager::ensure_state, start our "instance" + // (backed by fakes and propolis_mock_server) + inst.put_state(put_tx, InstanceStateRequested::Running) + .await + .expect("failed to send Instance::put_state"); + + // even though we ignore this result at instance creation time in + // practice (to avoid request timeouts), in this test let's make sure + // it actually completes. + timeout(TIMEOUT_DURATION, put_rx) + .await + .expect("timed out waiting for Instance::put_state result") + .expect("failed to receive Instance::put_state result") + .expect("Instance::put_state failed"); + + timeout( + TIMEOUT_DURATION, + state_rx.wait_for(|maybe_state| match maybe_state { + ReceivedInstanceState::InstancePut(sled_inst_state) => { + sled_inst_state.vmm_state.state == InstanceState::Running + } + _ => false, + }), + ) + .await + .expect("timed out waiting for InstanceState::Running in FakeNexus") + .expect("failed to receive FakeNexus' InstanceState"); + + logctx.cleanup_successful(); + } + + // tests around dropshot request timeouts during the blocking propolis setup + #[tokio::test] + async fn test_instance_create_timeout_while_starting_propolis() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_create_timeout_while_starting_propolis", + ); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + let _mock_zone_contexts = mock_zone_contexts(); + + let FakeNexusParts { nexus_client, nexus_server, state_rx } = + FakeNexusParts::new(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let storage_handle = fake_storage_manager_with_u2().await; + + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + + let inst = timeout( + TIMEOUT_DURATION, + instance_struct( + &logctx, + // we want to test propolis not ever coming up + SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), + nexus_client_with_resolver, + storage_handle, + &temp_dir, + ), + ) + .await + .expect("timed out creating Instance struct"); + + let (put_tx, put_rx) = oneshot::channel(); + + tokio::time::pause(); + + // pretending we're InstanceManager::ensure_state, try in vain to start + // our "instance", but no propolis server is running + inst.put_state(put_tx, InstanceStateRequested::Running) + .await + .expect("failed to send Instance::put_state"); + + let timeout_fut = timeout(TIMEOUT_DURATION, put_rx); + + tokio::time::advance(TIMEOUT_DURATION).await; + + tokio::time::resume(); + + timeout_fut + .await + .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); + + if let ReceivedInstanceState::InstancePut(SledInstanceState { + vmm_state: VmmRuntimeState { state: InstanceState::Running, .. }, + .. + }) = state_rx.borrow().to_owned() + { + panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); + } + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_instance_create_timeout_while_creating_zone() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_create_timeout_while_creating_zone", + ); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + + let rt_handle = tokio::runtime::Handle::current(); + + // time out while booting zone, on purpose! + let boot_ctx = MockZones::boot_context(); + boot_ctx.expect().return_once(move |_| { + rt_handle.block_on(tokio::time::sleep(TIMEOUT_DURATION * 2)); + Ok(()) + }); + let wait_ctx = illumos_utils::svc::wait_for_service_context(); + wait_ctx.expect().times(..).returning(|_, _, _| Ok(())); + let zone_id_ctx = MockZones::id_context(); + zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); + + let FakeNexusParts { nexus_client, nexus_server, state_rx } = + FakeNexusParts::new(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let storage_handle = fake_storage_manager_with_u2().await; + + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + + let inst = timeout( + TIMEOUT_DURATION, + instance_struct( + &logctx, + // isn't running because the "zone" never "boots" + SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), + nexus_client_with_resolver, + storage_handle, + &temp_dir, + ), + ) + .await + .expect("timed out creating Instance struct"); + + tokio::time::pause(); + + let (put_tx, put_rx) = oneshot::channel(); + + // pretending we're InstanceManager::ensure_state, try in vain to start + // our "instance", but the zone never finishes installing + inst.put_state(put_tx, InstanceStateRequested::Running) + .await + .expect("failed to send Instance::put_state"); + + let timeout_fut = timeout(TIMEOUT_DURATION, put_rx); + + tokio::time::advance(TIMEOUT_DURATION * 2).await; + + tokio::time::resume(); + + timeout_fut + .await + .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); + + if let ReceivedInstanceState::InstancePut(SledInstanceState { + vmm_state: VmmRuntimeState { state: InstanceState::Running, .. }, + .. + }) = state_rx.borrow().to_owned() + { + panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); + } + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_instance_manager_creation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_manager_creation", + ); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + let _mock_zone_contexts = mock_zone_contexts(); + + let storage_handle = fake_storage_manager_with_u2().await; + + let FakeNexusParts { nexus_client, nexus_server, mut state_rx } = + FakeNexusParts::new(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + + let InstanceManagerServices { + nexus_client, + vnic_allocator: _, + port_manager, + storage, + zone_bundler, + zone_builder_factory, + } = fake_instance_manager_services( + &logctx, + storage_handle, + nexus_client_with_resolver, + &temp_dir, + ); + + let etherstub = Etherstub("mystub".to_string()); + + let vmm_reservoir_manager = VmmReservoirManagerHandle::stub_for_test(); + + let mgr = crate::instance_manager::InstanceManager::new( + logctx.log.new(o!("component" => "InstanceManager")), + nexus_client, + etherstub, + port_manager, + storage, + zone_bundler, + zone_builder_factory, + vmm_reservoir_manager, + ) + .unwrap(); + + let (propolis_server, _propolis_client) = + propolis_mock_server(&logctx.log); + let propolis_addr = propolis_server.local_addr(); + + let instance_id = Uuid::new_v4(); + let propolis_id = Uuid::new_v4(); + let InstanceInitialState { + hardware, + instance_runtime, + vmm_runtime, + propolis_addr, + } = fake_instance_initial_state(propolis_id, propolis_addr); + + let metadata = InstanceMetadata { + silo_id: Uuid::new_v4(), + project_id: Uuid::new_v4(), + }; + + mgr.ensure_registered( + instance_id, + propolis_id, + hardware, + instance_runtime, + vmm_runtime, + propolis_addr, + metadata, + ) + .await + .unwrap(); + + mgr.ensure_state(instance_id, InstanceStateRequested::Running) + .await + .unwrap(); + + timeout( + TIMEOUT_DURATION, + state_rx.wait_for(|maybe_state| match maybe_state { + ReceivedInstanceState::InstancePut(sled_inst_state) => { + sled_inst_state.vmm_state.state == InstanceState::Running + } + _ => false, + }), + ) + .await + .expect("timed out waiting for InstanceState::Running in FakeNexus") + .expect("failed to receive FakeNexus' InstanceState"); + + logctx.cleanup_successful(); + } +} diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index badaf2d7baa..2c9780b3ce6 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -194,7 +194,23 @@ impl InstanceManager { }) .await .map_err(|_| Error::FailedSendInstanceManagerClosed)?; - rx.await? + + match target { + // these may involve a long-running zone creation, so avoid HTTP + // request timeouts by decoupling the response + // (see InstanceRunner::put_state) + InstanceStateRequested::MigrationTarget(_) + | InstanceStateRequested::Running => { + // We don't want the sending side of the channel to see an + // error if we drop rx without awaiting it. + // Since we don't care about the response here, we spawn rx + // into a task which will await it for us in the background. + tokio::spawn(rx); + Ok(InstancePutStateResponse { updated_runtime: None }) + } + InstanceStateRequested::Stopped + | InstanceStateRequested::Reboot => rx.await?, + } } pub async fn put_migration_ids( @@ -734,6 +750,11 @@ impl InstanceTicket { InstanceTicket { id, terminate_tx: Some(terminate_tx) } } + #[cfg(test)] + pub(crate) fn new_without_manager_for_test(id: Uuid) -> Self { + Self { id, terminate_tx: None } + } + /// Idempotently removes this instance from the tracked set of /// instances. This acts as an "upcall" for instances to remove /// themselves after stopping. diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index 12fcc05ce30..3f24c6a8065 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -60,6 +60,16 @@ impl NexusClientWithResolver { } } + // for when we have a NexusClient constructed from a FakeNexusServer + // (no need to expose this function outside of tests) + #[cfg(test)] + pub(crate) fn new_with_client( + client: NexusClient, + resolver: Arc, + ) -> Self { + Self { client, resolver } + } + /// Access the progenitor-based Nexus Client. pub fn client(&self) -> &NexusClient { &self.client diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index f74438a6784..a94b95fc475 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -129,7 +129,7 @@ pub struct InstancePutStateBody { /// The response sent from a request to move an instance into a specific runtime /// state. -#[derive(Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Serialize, Deserialize, JsonSchema)] pub struct InstancePutStateResponse { /// The current runtime state of the instance after handling the request to /// change its state. If the instance's state did not change, this field is diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index b772a60347b..c7bc7df9780 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -44,6 +44,7 @@ use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use uuid::Uuid; /// Simulates management of the control plane on a sled @@ -73,6 +74,7 @@ pub struct SledAgent { config: Config, fake_zones: Mutex, instance_ensure_state_error: Mutex>, + pub log: Logger, } fn extract_targets_from_volume_construction_request( @@ -171,6 +173,7 @@ impl SledAgent { zones: vec![], }), instance_ensure_state_error: Mutex::new(None), + log, }) } @@ -400,7 +403,28 @@ impl SledAgent { )); } InstanceStateRequested::Running => { - propolis_client::types::InstanceStateRequested::Run + let instances = self.instances.clone(); + let log = self.log.new( + o!("component" => "SledAgent-insure_instance_state"), + ); + tokio::spawn(async move { + tokio::time::sleep(Duration::from_secs(10)).await; + match instances + .sim_ensure(&instance_id, current, Some(state)) + .await + { + Ok(state) => { + let instance_state: nexus_client::types::SledInstanceState = state.into(); + info!(log, "sim_ensure success"; "instance_state" => #?instance_state); + } + Err(instance_put_error) => { + error!(log, "sim_ensure failure"; "error" => #?instance_put_error); + } + } + }); + return Ok(InstancePutStateResponse { + updated_runtime: None, + }); } InstanceStateRequested::Stopped => { propolis_client::types::InstanceStateRequested::Stop diff --git a/sled-agent/src/vmm_reservoir.rs b/sled-agent/src/vmm_reservoir.rs index d7b6b64ecf8..b16286f5f5d 100644 --- a/sled-agent/src/vmm_reservoir.rs +++ b/sled-agent/src/vmm_reservoir.rs @@ -119,6 +119,19 @@ impl VmmReservoirManagerHandle { } rx.await.map_err(|_| Error::ReplySenderDropped)? } + + #[cfg(test)] + pub fn stub_for_test() -> Self { + let (tx, _) = flume::bounded(1); + let (size_updated_tx, _) = broadcast::channel(1); + let _manager_handle = Arc::new(thread::spawn(|| {})); + Self { + reservoir_size: Arc::new(AtomicU64::new(0)), + tx, + size_updated_tx, + _manager_handle, + } + } } /// Manage the VMM reservoir in a background thread