From c0ba701fe04d6373b4a1dfa28482b78d727b5de0 Mon Sep 17 00:00:00 2001 From: lif <> Date: Thu, 28 Sep 2023 01:22:21 +0000 Subject: [PATCH 1/8] Add some unit tests for sled-agent Instance creation At time of writing, instance creation roughly looks like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (*blocking!*) - `RunningZone::install` and `Zones::boot` - `illumos_utils::svc::wait_for_service` - `self::wait_for_http_server` for propolis-server itself - sled-agent: `Instance::ensure_propolis_and_tasks` - sled-agent: spawn `Instance::monitor_state_task` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: return ok result - nexus: `handle_instance_put_result` Or at least, it does in the happy path. omicron#3927 saw propolis zone creation take longer than the minute nexus's call to sled-agent's `instance_put_state`. That might've looked something like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (*blocking!*) - `RunningZone::install` and `Zones::boot` - nexus: i've been waiting a whole minute for this. connection timeout! - nexus: `handle_instance_put_result` - sled-agent: [...] return... oh, they hung up. :( To avoid this timeout being implicit at the *Dropshot configuration* layer (that is to say, we should still have *some* timeout), we could consider a small refactor to make `instance_put_state` not a blocking call -- especially since it's already sending nexus updates on its progress via out-of-band `cpapi_instances_put` calls! That might look something like: - nexus -> sled-agent: `instance_put_state` - sled-agent: `InstanceManager::ensure_state` - sled-agent: spawn { - sled-agent: `Instance::propolis_ensure` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent: `Instance::setup_propolis_locked` (blocking!) - sled-agent: `Instance::ensure_propolis_and_tasks` - sled-agent: spawn `Instance::monitor_state_task` - sled-agent -> nexus: `cpapi_instances_put` (if not migrating) - sled-agent -> nexus: a cpapi call equivalent to the `handle_instance_put_result` nexus currently invokes after getting the response from the blocking call (With a way for nexus to cancel an instance creation by ID, and a timeout in sled-agent itself for terminating the attempt and reporting the failure back to nexus, and a shorter threshold for logging the event of an instance creation taking a long time.) Before such a change, though, we should really have some more tests around sled-agent's instance creation code at all! So here's a few. --- illumos-utils/src/running_zone.rs | 48 ++- sled-agent/src/fakes/nexus.rs | 35 ++- sled-agent/src/instance.rs | 474 +++++++++++++++++++++++++++++ sled-agent/src/instance_manager.rs | 5 + sled-agent/src/nexus.rs | 10 + 5 files changed, 569 insertions(+), 3 deletions(-) diff --git a/illumos-utils/src/running_zone.rs b/illumos-utils/src/running_zone.rs index 0dd8f85e4ec..c529a1b6d45 100644 --- a/illumos-utils/src/running_zone.rs +++ b/illumos-utils/src/running_zone.rs @@ -1193,29 +1193,55 @@ impl ZoneBuilderFactory { /// Created by [ZoneBuilderFactory]. #[derive(Default)] pub struct ZoneBuilder<'a> { + /// Logger to which status messages are written during zone installation. log: Option, + /// Allocates the NIC used for control plane communication. underlay_vnic_allocator: Option<&'a VnicAllocator>, + /// Filesystem path at which the installed zone will reside. zone_root_path: Option<&'a Utf8Path>, + /// The directories that will be searched for the image tarball for the + /// provided zone type ([`Self::with_zone_type`]). zone_image_paths: Option<&'a [Utf8PathBuf]>, + /// The name of the type of zone being created (e.g. "propolis-server") zone_type: Option<&'a str>, - unique_name: Option, // actually optional + /// Unique ID of the instance of the zone being created. (optional) + // *actually* optional (in contrast to other fields that are `Option` for + // builder purposes - that is, skipping this field in the builder will + // still result in an `Ok(InstalledZone)` from `.install()`, rather than + // an `Err(InstallZoneError::IncompleteBuilder)`. + unique_name: Option, + /// ZFS datasets to be accessed from within the zone. datasets: Option<&'a [zone::Dataset]>, + /// Filesystems to mount within the zone. filesystems: Option<&'a [zone::Fs]>, + /// Additional network device names to add to the zone. data_links: Option<&'a [String]>, + /// Device nodes to pass through to the zone. devices: Option<&'a [zone::Device]>, + /// OPTE devices for the guest network interfaces. opte_ports: Option>, - bootstrap_vnic: Option, // actually optional + /// NIC to use for creating a bootstrap address on the switch zone. + // actually optional (as above) + bootstrap_vnic: Option, + /// Physical NICs possibly provisioned to the zone. links: Option>, + /// The maximum set of privileges any process in this zone can obtain. limit_priv: Option>, + /// For unit tests only: if `Some`, then no actual zones will be installed + /// by this builder, and minimal facsimiles of them will be placed in + /// temporary directories according to the contents of the provided + /// `FakeZoneBuilderConfig`. fake_cfg: Option, } impl<'a> ZoneBuilder<'a> { + /// Logger to which status messages are written during zone installation. pub fn with_log(mut self, log: Logger) -> Self { self.log = Some(log); self } + /// Allocates the NIC used for control plane communication. pub fn with_underlay_vnic_allocator( mut self, vnic_allocator: &'a VnicAllocator, @@ -1224,11 +1250,14 @@ impl<'a> ZoneBuilder<'a> { self } + /// Filesystem path at which the installed zone will reside. pub fn with_zone_root_path(mut self, root_path: &'a Utf8Path) -> Self { self.zone_root_path = Some(root_path); self } + /// The directories that will be searched for the image tarball for the + /// provided zone type ([`Self::with_zone_type`]). pub fn with_zone_image_paths( mut self, image_paths: &'a [Utf8PathBuf], @@ -1237,56 +1266,68 @@ impl<'a> ZoneBuilder<'a> { self } + /// The name of the type of zone being created (e.g. "propolis-server") pub fn with_zone_type(mut self, zone_type: &'a str) -> Self { self.zone_type = Some(zone_type); self } + /// Unique ID of the instance of the zone being created. (optional) pub fn with_unique_name(mut self, uuid: Uuid) -> Self { self.unique_name = Some(uuid); self } + /// ZFS datasets to be accessed from within the zone. pub fn with_datasets(mut self, datasets: &'a [zone::Dataset]) -> Self { self.datasets = Some(datasets); self } + /// Filesystems to mount within the zone. pub fn with_filesystems(mut self, filesystems: &'a [zone::Fs]) -> Self { self.filesystems = Some(filesystems); self } + /// Additional network device names to add to the zone. pub fn with_data_links(mut self, links: &'a [String]) -> Self { self.data_links = Some(links); self } + /// Device nodes to pass through to the zone. pub fn with_devices(mut self, devices: &'a [zone::Device]) -> Self { self.devices = Some(devices); self } + /// OPTE devices for the guest network interfaces. pub fn with_opte_ports(mut self, ports: Vec<(Port, PortTicket)>) -> Self { self.opte_ports = Some(ports); self } + /// NIC to use for creating a bootstrap address on the switch zone. + /// (optional) pub fn with_bootstrap_vnic(mut self, vnic: Link) -> Self { self.bootstrap_vnic = Some(vnic); self } + /// Physical NICs possibly provisioned to the zone. pub fn with_links(mut self, links: Vec) -> Self { self.links = Some(links); self } + /// The maximum set of privileges any process in this zone can obtain. pub fn with_limit_priv(mut self, limit_priv: Vec) -> Self { self.limit_priv = Some(limit_priv); self } + // (used in unit tests) fn fake_install(self) -> Result { let zone = self .zone_type @@ -1324,6 +1365,9 @@ impl<'a> ZoneBuilder<'a> { .ok_or(InstallZoneError::IncompleteBuilder) } + /// Create the zone with the provided parameters. + /// Returns `Err(InstallZoneError::IncompleteBuilder)` if a necessary + /// parameter was not provided. pub async fn install(self) -> Result { if self.fake_cfg.is_some() { return self.fake_install(); diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs index 5920d40f6ca..4cff340c898 100644 --- a/sled-agent/src/fakes/nexus.rs +++ b/sled-agent/src/fakes/nexus.rs @@ -15,7 +15,9 @@ use hyper::Body; use internal_dns::ServiceName; use nexus_client::types::SledAgentInfo; use omicron_common::api::external::Error; -use omicron_common::api::internal::nexus::UpdateArtifactId; +use omicron_common::api::internal::nexus::{ + SledInstanceState, UpdateArtifactId, +}; use schemars::JsonSchema; use serde::Deserialize; use uuid::Uuid; @@ -44,6 +46,14 @@ pub trait FakeNexusServer: Send + Sync { ) -> Result<(), Error> { Err(Error::internal_error("Not implemented")) } + + fn cpapi_instances_put( + &self, + _instance_id: Uuid, + _new_runtime_state: SledInstanceState, + ) -> Result<(), Error> { + Err(Error::internal_error("Not implemented")) + } } /// Describes the server context type. @@ -107,11 +117,34 @@ async fn sled_agent_put( Ok(HttpResponseUpdatedNoContent()) } +#[derive(Deserialize, JsonSchema)] +struct InstancePathParam { + instance_id: Uuid, +} + +#[endpoint { + method = PUT, + path = "/instances/{instance_id}", +}] +async fn cpapi_instances_put( + request_context: RequestContext, + path_params: Path, + new_runtime_state: TypedBody, +) -> Result { + let context = request_context.context(); + context.cpapi_instances_put( + path_params.into_inner().instance_id, + new_runtime_state.into_inner(), + )?; + Ok(HttpResponseUpdatedNoContent()) +} + fn api() -> ApiDescription { let mut api = ApiDescription::new(); api.register(cpapi_artifact_download).unwrap(); api.register(sled_agent_get).unwrap(); api.register(sled_agent_put).unwrap(); + api.register(cpapi_instances_put).unwrap(); api } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 5ddca904036..890b30137fd 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1521,3 +1521,477 @@ impl InstanceRunner { out } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::fakes::nexus::{FakeNexusServer, ServerContext}; + use crate::nexus::NexusClient; + use crate::zone_bundle::CleanupContext; + use camino_tempfile::Utf8TempDir; + use dns_server::dns_server::ServerHandle as DnsServerHandle; + use dropshot::test_util::LogContext; + use dropshot::{HandlerTaskMode, HttpServer}; + use illumos_utils::dladm::MockDladm; + use illumos_utils::dladm::__mock_MockDladm::__create_vnic::Context as MockDladmCreateVnicContext; + use illumos_utils::dladm::__mock_MockDladm::__delete_vnic::Context as MockDladmDeleteVnicContext; + use illumos_utils::opte::params::DhcpConfig; + use illumos_utils::svc::__wait_for_service::Context as MockWaitForServiceContext; + use illumos_utils::zone::MockZones; + use illumos_utils::zone::__mock_MockZones::__boot::Context as MockZonesBootContext; + use illumos_utils::zone::__mock_MockZones::__id::Context as MockZonesIdContext; + use illumos_utils::zpool::ZpoolName; + use internal_dns::resolver::Resolver; + use internal_dns::ServiceName; + use omicron_common::api::external::{ + ByteCount, Generation, InstanceCpuCount, InstanceState, + }; + use omicron_common::api::internal::nexus::InstanceProperties; + use sled_storage::disk::{RawDisk, SyntheticDisk}; + use sled_storage::manager::FakeStorageManager; + use std::net::Ipv6Addr; + use tokio::sync::watch::Receiver; + use tokio::time::timeout; + + const TIMEOUT_DURATION: tokio::time::Duration = + tokio::time::Duration::from_secs(3); + + struct NexusServer { + observed_runtime_state: + tokio::sync::watch::Sender>, + } + impl FakeNexusServer for NexusServer { + fn cpapi_instances_put( + &self, + _instance_id: Uuid, + new_runtime_state: SledInstanceState, + ) -> Result<(), omicron_common::api::external::Error> { + self.observed_runtime_state.send(Some(new_runtime_state)) + .map_err(|_| omicron_common::api::external::Error::internal_error("couldn't send updated SledInstanceState to test driver")) + } + } + + fn fake_nexus_server( + logctx: &LogContext, + ) -> ( + NexusClient, + HttpServer, + Receiver>, + ) { + let (state_tx, state_rx) = tokio::sync::watch::channel(None); + + let nexus_server = crate::fakes::nexus::start_test_server( + logctx.log.new(o!("component" => "FakeNexusServer")), + Box::new(NexusServer { observed_runtime_state: state_tx }), + ); + let nexus_client = NexusClient::new( + &format!("http://{}", nexus_server.local_addr()), + logctx.log.new(o!("component" => "NexusClient")), + ); + + (nexus_client, nexus_server, state_rx) + } + + fn mock_vnic_contexts( + ) -> (MockDladmCreateVnicContext, MockDladmDeleteVnicContext) { + let create_vnic_ctx = MockDladm::create_vnic_context(); + let delete_vnic_ctx = MockDladm::delete_vnic_context(); + create_vnic_ctx.expect().return_once( + |physical_link: &Etherstub, _, _, _, _| { + assert_eq!(&physical_link.0, "mystub"); + Ok(()) + }, + ); + delete_vnic_ctx.expect().returning(|_| Ok(())); + (create_vnic_ctx, delete_vnic_ctx) + } + + // InstanceManager::ensure_state calls Instance::put_state(Running), + // which calls Instance::propolis_ensure, + // which spawns Instance::monitor_state_task, + // which calls cpapi_instances_put + // and calls Instance::setup_propolis_locked, + // which creates the zone (which isn't real in these tests, of course) + fn mock_zone_contexts( + ) -> (MockZonesBootContext, MockWaitForServiceContext, MockZonesIdContext) + { + let boot_ctx = MockZones::boot_context(); + boot_ctx.expect().return_once(|_| Ok(())); + let wait_ctx = illumos_utils::svc::wait_for_service_context(); + wait_ctx.expect().times(..).returning(|_, _, _| Ok(())); + let zone_id_ctx = MockZones::id_context(); + zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); + (boot_ctx, wait_ctx, zone_id_ctx) + } + + async fn dns_server( + logctx: &LogContext, + nexus_server: &HttpServer, + ) -> (DnsServerHandle, Arc, Utf8TempDir) { + let storage_path = + Utf8TempDir::new().expect("Failed to create temporary directory"); + let config_store = dns_server::storage::Config { + keep_old_generations: 3, + storage_path: storage_path.path().to_owned(), + }; + + let (dns_server, dns_dropshot) = dns_server::start_servers( + logctx.log.new(o!("component" => "DnsServer")), + dns_server::storage::Store::new( + logctx.log.new(o!("component" => "DnsStore")), + &config_store, + ) + .unwrap(), + &dns_server::dns_server::Config { + bind_address: "[::1]:0".parse().unwrap(), + }, + &dropshot::ConfigDropshot { + bind_address: "[::1]:0".parse().unwrap(), + request_body_max_bytes: 8 * 1024, + default_handler_task_mode: HandlerTaskMode::Detached, + }, + ) + .await + .expect("starting DNS server"); + + let dns_dropshot_client = dns_service_client::Client::new( + &format!("http://{}", dns_dropshot.local_addr()), + logctx.log.new(o!("component" => "DnsDropshotClient")), + ); + let mut dns_config = internal_dns::DnsConfigBuilder::new(); + let IpAddr::V6(nexus_ip_addr) = nexus_server.local_addr().ip() else { + panic!("IPv6 address required for nexus_server") + }; + let zone = dns_config.host_zone(Uuid::new_v4(), nexus_ip_addr).unwrap(); + dns_config + .service_backend_zone( + ServiceName::Nexus, + &zone, + nexus_server.local_addr().port(), + ) + .unwrap(); + let dns_config = dns_config.build(); + dns_dropshot_client.dns_config_put(&dns_config).await.unwrap(); + + let resolver = Arc::new( + Resolver::new_from_addrs( + logctx.log.new(o!("component" => "Resolver")), + &[dns_server.local_address()], + ) + .unwrap(), + ); + (dns_server, resolver, storage_path) + } + + // note the "mock" here is different from the vnic/zone contexts above. + // this is actually running code for a dropshot server from propolis. + // (might we want a locally-defined fake whose behavior we can control + // more directly from the test driver?) + // TODO: factor out, this is also in sled-agent-sim. + fn propolis_mock_server( + log: &Logger, + ) -> (HttpServer>, PropolisClient) { + let propolis_bind_address = + SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); // allocate port + let dropshot_config = dropshot::ConfigDropshot { + bind_address: propolis_bind_address, + ..Default::default() + }; + let propolis_log = log.new(o!("component" => "propolis-server-mock")); + let private = + Arc::new(propolis_mock_server::Context::new(propolis_log)); + info!(log, "Starting mock propolis-server..."); + let dropshot_log = log.new(o!("component" => "dropshot")); + let mock_api = propolis_mock_server::api(); + + let srv = dropshot::HttpServerStarter::new( + &dropshot_config, + mock_api, + private, + &dropshot_log, + ) + .expect("couldn't create mock propolis-server") + .start(); + + let client = propolis_client::Client::new(&format!( + "http://{}", + srv.local_addr() + )); + + (srv, client) + } + + // make a FakeStorageManager with a "U2" upserted + async fn fake_storage_manager_with_u2() -> StorageHandle { + let (storage_manager, storage_handle) = FakeStorageManager::new(); + tokio::spawn(storage_manager.run()); + let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); + let external_disk: RawDisk = + SyntheticDisk::new(external_zpool_name).into(); + storage_handle.upsert_disk(external_disk).await; + storage_handle + } + + async fn instance_struct( + logctx: &LogContext, + propolis_addr: SocketAddr, + nexus_client_with_resolver: NexusClientWithResolver, + storage_handle: StorageHandle, + ) -> Instance { + let id = Uuid::new_v4(); + let propolis_id = Uuid::new_v4(); + let ticket = InstanceTicket::new_without_manager_for_test(id); + let hardware = InstanceHardware { + properties: InstanceProperties { + ncpus: InstanceCpuCount(1), + memory: ByteCount::from_gibibytes_u32(1), + hostname: "bert".to_string(), + }, + nics: vec![], + source_nat: SourceNatConfig { + ip: IpAddr::V6(Ipv6Addr::UNSPECIFIED), + first_port: 0, + last_port: 0, + }, + ephemeral_ip: None, + floating_ips: vec![], + firewall_rules: vec![], + dhcp_config: DhcpConfig { + dns_servers: vec![], + host_domain: None, + search_domains: vec![], + }, + disks: vec![], + cloud_init_bytes: None, + }; + + let initial_state = InstanceInitialState { + hardware, + instance_runtime: InstanceRuntimeState { + propolis_id: Some(propolis_id), + dst_propolis_id: None, + migration_id: None, + gen: Generation::new(), + time_updated: Default::default(), + }, + vmm_runtime: VmmRuntimeState { + state: InstanceState::Creating, + gen: Generation::new(), + time_updated: Default::default(), + }, + propolis_addr, + }; + + let vnic_allocator = + VnicAllocator::new("Foo", Etherstub("mystub".to_string())); + let port_manager = PortManager::new( + logctx.log.new(o!("component" => "PortManager")), + Ipv6Addr::new(0xfd00, 0x1de, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01), + ); + + let cleanup_context = CleanupContext::default(); + let zone_bundler = ZoneBundler::new( + logctx.log.new(o!("component" => "ZoneBundler")), + storage_handle.clone(), + cleanup_context, + ); + + let services = InstanceManagerServices { + nexus_client: nexus_client_with_resolver, + vnic_allocator, + port_manager, + storage: storage_handle, + zone_bundler, + zone_builder_factory: ZoneBuilderFactory::fake(), + }; + + Instance::new( + logctx.log.new(o!("component" => "Instance")), + id, + propolis_id, + ticket, + initial_state, + services, + ) + .unwrap() + } + + #[tokio::test] + async fn test_instance_create_events_normal() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_create_events_normal", + ); + + let (propolis_server, _propolis_client) = + propolis_mock_server(&logctx.log); + let propolis_addr = propolis_server.local_addr(); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + let _mock_zone_contexts = mock_zone_contexts(); + + let (nexus_client, nexus_server, mut state_rx) = + fake_nexus_server(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let storage_handle = fake_storage_manager_with_u2().await; + + let inst = timeout( + TIMEOUT_DURATION, + instance_struct( + &logctx, + propolis_addr, + nexus_client_with_resolver, + storage_handle, + ), + ) + .await + .expect("timed out creating Instance struct"); + + timeout( + TIMEOUT_DURATION, + inst.put_state(InstanceStateRequested::Running), + ) + .await + .expect("timed out waiting for Instance::put_state") + .unwrap(); + + timeout( + TIMEOUT_DURATION, + state_rx.wait_for(|maybe_state| { + maybe_state + .as_ref() + .map(|sled_inst_state| { + sled_inst_state.vmm_state.state + == InstanceState::Running + }) + .unwrap_or(false) + }), + ) + .await + .expect("timed out waiting for InstanceState::Running in FakeNexus") + .unwrap(); + + logctx.cleanup_successful(); + } + + // tests around dropshot request timeouts during the blocking propolis setup + #[tokio::test] + async fn test_instance_create_timeout_while_starting_propolis() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_create_timeout_while_starting_propolis", + ); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + let _mock_zone_contexts = mock_zone_contexts(); + + let (nexus_client, nexus_server, state_rx) = fake_nexus_server(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let storage_handle = fake_storage_manager_with_u2().await; + + let inst = timeout( + TIMEOUT_DURATION, + instance_struct( + &logctx, + // we want to test propolis not ever coming up + SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), + nexus_client_with_resolver, + storage_handle, + ), + ) + .await + .expect("timed out creating Instance struct"); + + timeout(TIMEOUT_DURATION, inst.put_state(InstanceStateRequested::Running)) + .await + .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); + + if let Some(SledInstanceState { + vmm_state: VmmRuntimeState { state: InstanceState::Running, .. }, + .. + }) = state_rx.borrow().to_owned() + { + panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); + } + + logctx.cleanup_successful(); + } + + #[tokio::test] + async fn test_instance_create_timeout_while_creating_zone() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_create_timeout_while_creating_zone", + ); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + + // time out while booting zone, on purpose! + let boot_ctx = MockZones::boot_context(); + boot_ctx.expect().return_once(|_| { + std::thread::sleep(TIMEOUT_DURATION * 2); + Ok(()) + }); + let wait_ctx = illumos_utils::svc::wait_for_service_context(); + wait_ctx.expect().times(..).returning(|_, _, _| Ok(())); + let zone_id_ctx = MockZones::id_context(); + zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); + let halt_rm_ctx = MockZones::halt_and_remove_logged_context(); + halt_rm_ctx.expect().times(..).returning(|_, _| Ok(())); + + let (nexus_client, nexus_server, state_rx) = fake_nexus_server(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let storage_handle = fake_storage_manager_with_u2().await; + + let inst = timeout( + TIMEOUT_DURATION, + instance_struct( + &logctx, + // isn't running because the "zone" never "boots" + SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), + nexus_client_with_resolver, + storage_handle, + ), + ) + .await + .expect("timed out creating Instance struct"); + + timeout(TIMEOUT_DURATION, inst.put_state(InstanceStateRequested::Running)) + .await + .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); + + if let Some(SledInstanceState { + vmm_state: VmmRuntimeState { state: InstanceState::Running, .. }, + .. + }) = state_rx.borrow().to_owned() + { + panic!("Nexus's InstanceState should never have reached running if zone creation timed out"); + } + + logctx.cleanup_successful(); + } +} diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index badaf2d7baa..fee42849f42 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -734,6 +734,11 @@ impl InstanceTicket { InstanceTicket { id, terminate_tx: Some(terminate_tx) } } + #[cfg(test)] + pub(crate) fn new_without_manager_for_test(id: Uuid) -> Self { + Self { id, inner: None } + } + /// Idempotently removes this instance from the tracked set of /// instances. This acts as an "upcall" for instances to remove /// themselves after stopping. diff --git a/sled-agent/src/nexus.rs b/sled-agent/src/nexus.rs index 12fcc05ce30..3f24c6a8065 100644 --- a/sled-agent/src/nexus.rs +++ b/sled-agent/src/nexus.rs @@ -60,6 +60,16 @@ impl NexusClientWithResolver { } } + // for when we have a NexusClient constructed from a FakeNexusServer + // (no need to expose this function outside of tests) + #[cfg(test)] + pub(crate) fn new_with_client( + client: NexusClient, + resolver: Arc, + ) -> Self { + Self { client, resolver } + } + /// Access the progenitor-based Nexus Client. pub fn client(&self) -> &NexusClient { &self.client From a3da13624d3b8e9db179dbfcd85a53e510f7d4be Mon Sep 17 00:00:00 2001 From: lif <> Date: Sun, 10 Dec 2023 02:06:02 -0800 Subject: [PATCH 2/8] sled-agent: don't block during instance creation request from nexus Alleviating request timeouts occurring when propolis zone installation takes too long (Propolis zone installation took 81 seconds and caused instance start to time out #3927) by making the zone installation not happen during a request handler. Since the instance creation request no longer blocks, we need to wait before proceeding in some cases where we had assumed that a successful return from the Nexus call meant the instance existed, e.g. test_instance_serial now polls for the instance's running state before attempting to send serial console data requests. --- end-to-end-tests/src/instance_launch.rs | 70 ++++- nexus/src/app/instance.rs | 3 + nexus/tests/integration_tests/instances.rs | 21 +- sled-agent/src/fakes/nexus.rs | 1 - sled-agent/src/instance.rs | 286 ++++++++++++++++----- sled-agent/src/instance_manager.rs | 18 +- sled-agent/src/params.rs | 2 +- sled-agent/src/sim/sled_agent.rs | 26 +- 8 files changed, 337 insertions(+), 90 deletions(-) diff --git a/end-to-end-tests/src/instance_launch.rs b/end-to-end-tests/src/instance_launch.rs index 1aae46fe984..377fef4c0bc 100644 --- a/end-to-end-tests/src/instance_launch.rs +++ b/end-to-end-tests/src/instance_launch.rs @@ -7,7 +7,7 @@ use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError}; use oxide_client::types::{ ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate, InstanceCpuCount, InstanceCreate, InstanceDiskAttachment, - InstanceNetworkInterfaceAttachment, SshKeyCreate, + InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate, }; use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt}; use russh::{ChannelMsg, Disconnect}; @@ -15,7 +15,6 @@ use russh_keys::key::{KeyPair, PublicKey}; use russh_keys::PublicKeyBase64; use std::sync::Arc; use std::time::Duration; -use tokio::time::sleep; #[tokio::test] async fn instance_launch() -> Result<()> { @@ -106,6 +105,19 @@ async fn instance_launch() -> Result<()> { type Error = CondCheckError>; + let instance_state = ctx + .client + .instance_view() + .project(ctx.project_name.clone()) + .instance(instance.name.clone()) + .send() + .await? + .run_state; + + if instance_state == InstanceState::Starting { + return Err(Error::NotYet); + } + let data = String::from_utf8_lossy( &ctx.client .instance_serial_console() @@ -188,19 +200,49 @@ async fn instance_launch() -> Result<()> { // check that we saw it on the console eprintln!("waiting for serial console"); - sleep(Duration::from_secs(5)).await; - let data = String::from_utf8_lossy( - &ctx.client - .instance_serial_console() - .project(ctx.project_name.clone()) - .instance(instance.name.clone()) - .most_recent(1024 * 1024) - .max_bytes(1024 * 1024) - .send() - .await? - .data, + + let data = wait_for_condition( + || async { + type Error = + CondCheckError>; + + let instance_state = ctx + .client + .instance_view() + .project(ctx.project_name.clone()) + .instance(instance.name.clone()) + .send() + .await? + .run_state; + + if instance_state == InstanceState::Starting { + return Err(Error::NotYet); + } + + let data = String::from_utf8_lossy( + &ctx.client + .instance_serial_console() + .project(ctx.project_name.clone()) + .instance(instance.name.clone()) + .most_recent(1024 * 1024) + .max_bytes(1024 * 1024) + .send() + .await + .map_err(|_e| Error::NotYet)? + .data, + ) + .into_owned(); + if data.contains("-----END SSH HOST KEY KEYS-----") { + Ok(data) + } else { + Err(Error::NotYet) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(300), ) - .into_owned(); + .await?; + ensure!( data.contains("Hello, Oxide!"), "string not seen on console\n{}", diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 2d09078e189..2300bd56f26 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -986,6 +986,9 @@ impl super::Nexus { // // If the operation failed, kick the sled agent error back up to // the caller to let it decide how to handle it. + // + // When creating the zone for the first time, we just get + // Ok(None) here, which is a no-op in write_returned_instance_state. match instance_put_result { Ok(state) => self .write_returned_instance_state(&instance_id, state) diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 309036256f9..03d5a8c8a99 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -67,6 +67,7 @@ use sled_agent_client::TestInterfaces as _; use std::convert::TryFrom; use std::net::Ipv4Addr; use std::sync::Arc; +use std::time::Duration; use uuid::Uuid; use dropshot::test_util::ClientTestContext; @@ -80,6 +81,8 @@ use nexus_test_utils::resource_helpers::{ use nexus_test_utils_macros::nexus_test; use nexus_types::external_api::shared::SiloRole; use omicron_sled_agent::sim; +use omicron_test_utils::dev::poll; +use omicron_test_utils::dev::poll::CondCheckError; type ControlPlaneTestContext = nexus_test_utils::ControlPlaneTestContext; @@ -3794,10 +3797,22 @@ async fn test_instance_serial(cptestctx: &ControlPlaneTestContext) { // Create an instance and poke it to ensure it's running. let instance = create_instance(client, PROJECT_NAME, instance_name).await; - instance_simulate(nexus, &instance.identity.id).await; - let instance_next = instance_get(&client, &instance_url).await; + let instance_next = poll::wait_for_condition( + || async { + instance_simulate(nexus, &instance.identity.id).await; + let instance_next = instance_get(&client, &instance_url).await; + if instance_next.runtime.run_state == InstanceState::Running { + Ok(instance_next) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &Duration::from_secs(5), + &Duration::from_secs(60), + ) + .await + .unwrap(); identity_eq(&instance.identity, &instance_next.identity); - assert_eq!(instance_next.runtime.run_state, InstanceState::Running); assert!( instance_next.runtime.time_run_state_updated > instance.runtime.time_run_state_updated diff --git a/sled-agent/src/fakes/nexus.rs b/sled-agent/src/fakes/nexus.rs index 4cff340c898..de37b77bcd5 100644 --- a/sled-agent/src/fakes/nexus.rs +++ b/sled-agent/src/fakes/nexus.rs @@ -121,7 +121,6 @@ async fn sled_agent_put( struct InstancePathParam { instance_id: Uuid, } - #[endpoint { method = PUT, path = "/instances/{instance_id}", diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 890b30137fd..60cb33306bd 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -30,7 +30,6 @@ use illumos_utils::link::VnicAllocator; use illumos_utils::opte::{DhcpCfg, PortManager}; use illumos_utils::running_zone::{RunningZone, ZoneBuilderFactory}; use illumos_utils::svc::wait_for_service; -use illumos_utils::zone::Zones; use illumos_utils::zone::PROPOLIS_ZONE_PREFIX; use omicron_common::address::NEXUS_INTERNAL_PORT; use omicron_common::api::internal::nexus::{ @@ -52,6 +51,11 @@ use std::sync::Arc; use tokio::sync::{mpsc, oneshot}; use uuid::Uuid; +#[cfg(test)] +use illumos_utils::zone::MockZones as Zones; +#[cfg(not(test))] +use illumos_utils::zone::Zones; + // The depth of the request queue for the instance. const QUEUE_SIZE: usize = 32; @@ -411,7 +415,9 @@ impl InstanceRunner { }, Some(PutState{ state, tx }) => { tx.send(self.put_state(state).await - .map(|r| InstancePutStateResponse { updated_runtime: Some(r) }) + .map(|r| InstancePutStateResponse { + updated_runtime: Some(r), + }) .map_err(|e| e.into())) .map_err(|_| Error::FailedSendClientClosed) }, @@ -1544,21 +1550,29 @@ mod tests { use internal_dns::resolver::Resolver; use internal_dns::ServiceName; use omicron_common::api::external::{ - ByteCount, Generation, InstanceCpuCount, InstanceState, + ByteCount, Generation, Hostname, InstanceCpuCount, InstanceState, }; use omicron_common::api::internal::nexus::InstanceProperties; use sled_storage::disk::{RawDisk, SyntheticDisk}; use sled_storage::manager::FakeStorageManager; use std::net::Ipv6Addr; + use std::str::FromStr; use tokio::sync::watch::Receiver; use tokio::time::timeout; const TIMEOUT_DURATION: tokio::time::Duration = tokio::time::Duration::from_secs(3); + #[derive(Default, Clone)] + enum ReceivedInstanceState { + #[default] + None, + InstancePut(SledInstanceState), + } + struct NexusServer { observed_runtime_state: - tokio::sync::watch::Sender>, + tokio::sync::watch::Sender, } impl FakeNexusServer for NexusServer { fn cpapi_instances_put( @@ -1566,30 +1580,38 @@ mod tests { _instance_id: Uuid, new_runtime_state: SledInstanceState, ) -> Result<(), omicron_common::api::external::Error> { - self.observed_runtime_state.send(Some(new_runtime_state)) - .map_err(|_| omicron_common::api::external::Error::internal_error("couldn't send updated SledInstanceState to test driver")) + self.observed_runtime_state + .send(ReceivedInstanceState::InstancePut(new_runtime_state)) + .map_err(|_| { + omicron_common::api::external::Error::internal_error( + "couldn't send SledInstanceState to test driver", + ) + }) } } - fn fake_nexus_server( - logctx: &LogContext, - ) -> ( - NexusClient, - HttpServer, - Receiver>, - ) { - let (state_tx, state_rx) = tokio::sync::watch::channel(None); - - let nexus_server = crate::fakes::nexus::start_test_server( - logctx.log.new(o!("component" => "FakeNexusServer")), - Box::new(NexusServer { observed_runtime_state: state_tx }), - ); - let nexus_client = NexusClient::new( - &format!("http://{}", nexus_server.local_addr()), - logctx.log.new(o!("component" => "NexusClient")), - ); + struct FakeNexusParts { + nexus_client: NexusClient, + nexus_server: HttpServer, + state_rx: Receiver, + } - (nexus_client, nexus_server, state_rx) + impl FakeNexusParts { + fn new(logctx: &LogContext) -> Self { + let (state_tx, state_rx) = + tokio::sync::watch::channel(ReceivedInstanceState::None); + + let nexus_server = crate::fakes::nexus::start_test_server( + logctx.log.new(o!("component" => "FakeNexusServer")), + Box::new(NexusServer { observed_runtime_state: state_tx }), + ); + let nexus_client = NexusClient::new( + &format!("http://{}", nexus_server.local_addr()), + logctx.log.new(o!("component" => "NexusClient")), + ); + + Self { nexus_client, nexus_server, state_rx } + } } fn mock_vnic_contexts( @@ -1610,7 +1632,7 @@ mod tests { // which calls Instance::propolis_ensure, // which spawns Instance::monitor_state_task, // which calls cpapi_instances_put - // and calls Instance::setup_propolis_locked, + // and calls Instance::setup_propolis_inner, // which creates the zone (which isn't real in these tests, of course) fn mock_zone_contexts( ) -> (MockZonesBootContext, MockWaitForServiceContext, MockZonesIdContext) @@ -1740,12 +1762,38 @@ mod tests { ) -> Instance { let id = Uuid::new_v4(); let propolis_id = Uuid::new_v4(); + let ticket = InstanceTicket::new_without_manager_for_test(id); + + let initial_state = + fake_instance_initial_state(propolis_id, propolis_addr); + + let services = fake_instance_manager_services( + logctx, + storage_handle, + nexus_client_with_resolver, + ); + + Instance::new( + logctx.log.new(o!("component" => "Instance")), + id, + propolis_id, + ticket, + initial_state, + services, + ) + .unwrap() + } + + fn fake_instance_initial_state( + propolis_id: Uuid, + propolis_addr: SocketAddr, + ) -> InstanceInitialState { let hardware = InstanceHardware { properties: InstanceProperties { ncpus: InstanceCpuCount(1), memory: ByteCount::from_gibibytes_u32(1), - hostname: "bert".to_string(), + hostname: Hostname::from_str("bert").unwrap(), }, nics: vec![], source_nat: SourceNatConfig { @@ -1765,7 +1813,7 @@ mod tests { cloud_init_bytes: None, }; - let initial_state = InstanceInitialState { + InstanceInitialState { hardware, instance_runtime: InstanceRuntimeState { propolis_id: Some(propolis_id), @@ -1780,8 +1828,14 @@ mod tests { time_updated: Default::default(), }, propolis_addr, - }; + } + } + fn fake_instance_manager_services( + logctx: &LogContext, + storage_handle: StorageHandle, + nexus_client_with_resolver: NexusClientWithResolver, + ) -> InstanceManagerServices { let vnic_allocator = VnicAllocator::new("Foo", Etherstub("mystub".to_string())); let port_manager = PortManager::new( @@ -1796,24 +1850,14 @@ mod tests { cleanup_context, ); - let services = InstanceManagerServices { + InstanceManagerServices { nexus_client: nexus_client_with_resolver, vnic_allocator, port_manager, storage: storage_handle, zone_bundler, zone_builder_factory: ZoneBuilderFactory::fake(), - }; - - Instance::new( - logctx.log.new(o!("component" => "Instance")), - id, - propolis_id, - ticket, - initial_state, - services, - ) - .unwrap() + } } #[tokio::test] @@ -1830,8 +1874,8 @@ mod tests { let _mock_vnic_contexts = mock_vnic_contexts(); let _mock_zone_contexts = mock_zone_contexts(); - let (nexus_client, nexus_server, mut state_rx) = - fake_nexus_server(&logctx); + let FakeNexusParts { nexus_client, nexus_server, mut state_rx } = + FakeNexusParts::new(&logctx); let (_dns_server, resolver, _dns_config_dir) = timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) @@ -1855,29 +1899,30 @@ mod tests { .await .expect("timed out creating Instance struct"); - timeout( - TIMEOUT_DURATION, - inst.put_state(InstanceStateRequested::Running), - ) - .await - .expect("timed out waiting for Instance::put_state") - .unwrap(); + let (put_tx, put_rx) = oneshot::channel(); + + inst.put_state(put_tx, InstanceStateRequested::Running) + .await + .expect("failed to send Instance::put_state"); + + timeout(TIMEOUT_DURATION, put_rx) + .await + .expect("timed out waiting for Instance::put_state result") + .expect("failed to receive Instance::put_state result") + .expect("Instance::put_state failed"); timeout( TIMEOUT_DURATION, - state_rx.wait_for(|maybe_state| { - maybe_state - .as_ref() - .map(|sled_inst_state| { - sled_inst_state.vmm_state.state - == InstanceState::Running - }) - .unwrap_or(false) + state_rx.wait_for(|maybe_state| match maybe_state { + ReceivedInstanceState::InstancePut(sled_inst_state) => { + sled_inst_state.vmm_state.state == InstanceState::Running + } + _ => false, }), ) .await .expect("timed out waiting for InstanceState::Running in FakeNexus") - .unwrap(); + .expect("failed to receive FakeNexus' InstanceState"); logctx.cleanup_successful(); } @@ -1893,7 +1938,8 @@ mod tests { let _mock_vnic_contexts = mock_vnic_contexts(); let _mock_zone_contexts = mock_zone_contexts(); - let (nexus_client, nexus_server, state_rx) = fake_nexus_server(&logctx); + let FakeNexusParts { nexus_client, nexus_server, state_rx } = + FakeNexusParts::new(&logctx); let (_dns_server, resolver, _dns_config_dir) = timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) @@ -1918,11 +1964,17 @@ mod tests { .await .expect("timed out creating Instance struct"); - timeout(TIMEOUT_DURATION, inst.put_state(InstanceStateRequested::Running)) + let (put_tx, put_rx) = oneshot::channel(); + + inst.put_state(put_tx, InstanceStateRequested::Running) + .await + .expect("failed to send Instance::put_state"); + + timeout(TIMEOUT_DURATION, put_rx) .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); - if let Some(SledInstanceState { + if let ReceivedInstanceState::InstancePut(SledInstanceState { vmm_state: VmmRuntimeState { state: InstanceState::Running, .. }, .. }) = state_rx.borrow().to_owned() @@ -1949,13 +2001,12 @@ mod tests { Ok(()) }); let wait_ctx = illumos_utils::svc::wait_for_service_context(); - wait_ctx.expect().times(..).returning(|_, _, _| Ok(())); + wait_ctx.expect().times(1..).returning(|_, _, _| Ok(())); let zone_id_ctx = MockZones::id_context(); - zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); - let halt_rm_ctx = MockZones::halt_and_remove_logged_context(); - halt_rm_ctx.expect().times(..).returning(|_, _| Ok(())); + zone_id_ctx.expect().times(1..).returning(|_| Ok(Some(1))); - let (nexus_client, nexus_server, state_rx) = fake_nexus_server(&logctx); + let FakeNexusParts { nexus_client, nexus_server, state_rx } = + FakeNexusParts::new(&logctx); let (_dns_server, resolver, _dns_config_dir) = timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) @@ -1980,11 +2031,17 @@ mod tests { .await .expect("timed out creating Instance struct"); - timeout(TIMEOUT_DURATION, inst.put_state(InstanceStateRequested::Running)) + let (put_tx, put_rx) = oneshot::channel(); + + inst.put_state(put_tx, InstanceStateRequested::Running) + .await + .expect("failed to send Instance::put_state"); + + timeout(TIMEOUT_DURATION, put_rx) .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); - if let Some(SledInstanceState { + if let ReceivedInstanceState::InstancePut(SledInstanceState { vmm_state: VmmRuntimeState { state: InstanceState::Running, .. }, .. }) = state_rx.borrow().to_owned() @@ -1994,4 +2051,97 @@ mod tests { logctx.cleanup_successful(); } + + #[tokio::test] + async fn test_instance_manager_creation() { + let logctx = omicron_test_utils::dev::test_setup_log( + "test_instance_manager_creation", + ); + + // automock'd things used during this test + let _mock_vnic_contexts = mock_vnic_contexts(); + let _mock_zone_contexts = mock_zone_contexts(); + + let storage_handle = fake_storage_manager_with_u2().await; + + let FakeNexusParts { nexus_client, nexus_server, mut state_rx } = + FakeNexusParts::new(&logctx); + + let (_dns_server, resolver, _dns_config_dir) = + timeout(TIMEOUT_DURATION, dns_server(&logctx, &nexus_server)) + .await + .expect("timed out making DNS server and Resolver"); + + let nexus_client_with_resolver = + NexusClientWithResolver::new_with_client(nexus_client, resolver); + + let InstanceManagerServices { + nexus_client, + vnic_allocator: _, + port_manager, + storage, + zone_bundler, + zone_builder_factory, + } = fake_instance_manager_services( + &logctx, + storage_handle, + nexus_client_with_resolver, + ); + + let etherstub = Etherstub("mystub".to_string()); + + let mgr = crate::instance_manager::InstanceManager::new( + logctx.log.new(o!("component" => "InstanceManager")), + nexus_client, + etherstub, + port_manager, + storage, + zone_bundler, + zone_builder_factory, + ) + .unwrap(); + + let (propolis_server, _propolis_client) = + propolis_mock_server(&logctx.log); + let propolis_addr = propolis_server.local_addr(); + + let instance_id = Uuid::new_v4(); + let propolis_id = Uuid::new_v4(); + let InstanceInitialState { + hardware, + instance_runtime, + vmm_runtime, + propolis_addr, + } = fake_instance_initial_state(propolis_id, propolis_addr); + + mgr.ensure_registered( + instance_id, + propolis_id, + hardware, + instance_runtime, + vmm_runtime, + propolis_addr, + ) + .await + .unwrap(); + + mgr.ensure_state(instance_id, InstanceStateRequested::Running) + .await + .unwrap(); + + timeout( + TIMEOUT_DURATION, + state_rx.wait_for(|maybe_state| match maybe_state { + ReceivedInstanceState::InstancePut(sled_inst_state) => { + sled_inst_state.vmm_state.state == InstanceState::Running + } + _ => false, + }), + ) + .await + .expect("timed out waiting for InstanceState::Running in FakeNexus") + .expect("failed to receive FakeNexus' InstanceState"); + + logctx.cleanup_successful(); + } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index fee42849f42..47c9dfcb58c 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -185,6 +185,7 @@ impl InstanceManager { target: InstanceStateRequested, ) -> Result { let (tx, rx) = oneshot::channel(); + self.inner .tx .send(InstanceManagerRequest::EnsureState { @@ -194,7 +195,20 @@ impl InstanceManager { }) .await .map_err(|_| Error::FailedSendInstanceManagerClosed)?; - rx.await? + + match target { + // these may involve a long-running zone creation, so avoid HTTP + // request timeouts by decoupling the response + // (see InstanceRunner::put_state) + InstanceStateRequested::MigrationTarget(_) + | InstanceStateRequested::Running => { + // don't error on channel being closed + tokio::spawn(rx); + Ok(InstancePutStateResponse { updated_runtime: None }) + } + InstanceStateRequested::Stopped + | InstanceStateRequested::Reboot => rx.await?, + } } pub async fn put_migration_ids( @@ -736,7 +750,7 @@ impl InstanceTicket { #[cfg(test)] pub(crate) fn new_without_manager_for_test(id: Uuid) -> Self { - Self { id, inner: None } + Self { id, terminate_tx: None } } /// Idempotently removes this instance from the tracked set of diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index f74438a6784..a94b95fc475 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -129,7 +129,7 @@ pub struct InstancePutStateBody { /// The response sent from a request to move an instance into a specific runtime /// state. -#[derive(Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Serialize, Deserialize, JsonSchema)] pub struct InstancePutStateResponse { /// The current runtime state of the instance after handling the request to /// change its state. If the instance's state did not change, this field is diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index b772a60347b..c7bc7df9780 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -44,6 +44,7 @@ use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; use uuid::Uuid; /// Simulates management of the control plane on a sled @@ -73,6 +74,7 @@ pub struct SledAgent { config: Config, fake_zones: Mutex, instance_ensure_state_error: Mutex>, + pub log: Logger, } fn extract_targets_from_volume_construction_request( @@ -171,6 +173,7 @@ impl SledAgent { zones: vec![], }), instance_ensure_state_error: Mutex::new(None), + log, }) } @@ -400,7 +403,28 @@ impl SledAgent { )); } InstanceStateRequested::Running => { - propolis_client::types::InstanceStateRequested::Run + let instances = self.instances.clone(); + let log = self.log.new( + o!("component" => "SledAgent-insure_instance_state"), + ); + tokio::spawn(async move { + tokio::time::sleep(Duration::from_secs(10)).await; + match instances + .sim_ensure(&instance_id, current, Some(state)) + .await + { + Ok(state) => { + let instance_state: nexus_client::types::SledInstanceState = state.into(); + info!(log, "sim_ensure success"; "instance_state" => #?instance_state); + } + Err(instance_put_error) => { + error!(log, "sim_ensure failure"; "error" => #?instance_put_error); + } + } + }); + return Ok(InstancePutStateResponse { + updated_runtime: None, + }); } InstanceStateRequested::Stopped => { propolis_client::types::InstanceStateRequested::Stop From a0075039b1b173b4ffe6d14c6f78aec03d9c2f0f Mon Sep 17 00:00:00 2001 From: lif <> Date: Wed, 13 Mar 2024 03:30:14 -0700 Subject: [PATCH 3/8] post-rebase updates --- sled-agent/src/instance.rs | 19 ++++++++++++++++++- sled-agent/src/vmm_reservoir.rs | 13 +++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 60cb33306bd..c6087a03b71 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1533,6 +1533,7 @@ mod tests { use super::*; use crate::fakes::nexus::{FakeNexusServer, ServerContext}; use crate::nexus::NexusClient; + use crate::vmm_reservoir::VmmReservoirManagerHandle; use crate::zone_bundle::CleanupContext; use camino_tempfile::Utf8TempDir; use dns_server::dns_server::ServerHandle as DnsServerHandle; @@ -1774,6 +1775,11 @@ mod tests { nexus_client_with_resolver, ); + let metadata = InstanceMetadata { + silo_id: Uuid::new_v4(), + project_id: Uuid::new_v4(), + }; + Instance::new( logctx.log.new(o!("component" => "Instance")), id, @@ -1781,6 +1787,7 @@ mod tests { ticket, initial_state, services, + metadata, ) .unwrap() } @@ -1856,7 +1863,7 @@ mod tests { port_manager, storage: storage_handle, zone_bundler, - zone_builder_factory: ZoneBuilderFactory::fake(), + zone_builder_factory: ZoneBuilderFactory::fake(None), } } @@ -2090,6 +2097,9 @@ mod tests { let etherstub = Etherstub("mystub".to_string()); + let vmm_reservoir_manager = + VmmReservoirManagerHandle::stub_for_test(); + let mgr = crate::instance_manager::InstanceManager::new( logctx.log.new(o!("component" => "InstanceManager")), nexus_client, @@ -2098,6 +2108,7 @@ mod tests { storage, zone_bundler, zone_builder_factory, + vmm_reservoir_manager, ) .unwrap(); @@ -2114,6 +2125,11 @@ mod tests { propolis_addr, } = fake_instance_initial_state(propolis_id, propolis_addr); + let metadata = InstanceMetadata { + silo_id: Uuid::new_v4(), + project_id: Uuid::new_v4(), + }; + mgr.ensure_registered( instance_id, propolis_id, @@ -2121,6 +2137,7 @@ mod tests { instance_runtime, vmm_runtime, propolis_addr, + metadata, ) .await .unwrap(); diff --git a/sled-agent/src/vmm_reservoir.rs b/sled-agent/src/vmm_reservoir.rs index d7b6b64ecf8..b16286f5f5d 100644 --- a/sled-agent/src/vmm_reservoir.rs +++ b/sled-agent/src/vmm_reservoir.rs @@ -119,6 +119,19 @@ impl VmmReservoirManagerHandle { } rx.await.map_err(|_| Error::ReplySenderDropped)? } + + #[cfg(test)] + pub fn stub_for_test() -> Self { + let (tx, _) = flume::bounded(1); + let (size_updated_tx, _) = broadcast::channel(1); + let _manager_handle = Arc::new(thread::spawn(|| {})); + Self { + reservoir_size: Arc::new(AtomicU64::new(0)), + tx, + size_updated_tx, + _manager_handle, + } + } } /// Manage the VMM reservoir in a background thread From b026a3587318f2071eabdd4e1b71e1cf6b166d02 Mon Sep 17 00:00:00 2001 From: lif <> Date: Wed, 13 Mar 2024 03:48:52 -0700 Subject: [PATCH 4/8] fmt --- sled-agent/src/instance.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index c6087a03b71..8b22e360dc6 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -2097,8 +2097,7 @@ mod tests { let etherstub = Etherstub("mystub".to_string()); - let vmm_reservoir_manager = - VmmReservoirManagerHandle::stub_for_test(); + let vmm_reservoir_manager = VmmReservoirManagerHandle::stub_for_test(); let mgr = crate::instance_manager::InstanceManager::new( logctx.log.new(o!("component" => "InstanceManager")), From 71de4336851f9eee5cafbfdd8a5632f487d29cd6 Mon Sep 17 00:00:00 2001 From: lif <> Date: Wed, 13 Mar 2024 14:58:01 -0700 Subject: [PATCH 5/8] clean up fake zone files after tests --- sled-agent/src/instance.rs | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 8b22e360dc6..8e63087e46a 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1760,6 +1760,7 @@ mod tests { propolis_addr: SocketAddr, nexus_client_with_resolver: NexusClientWithResolver, storage_handle: StorageHandle, + temp_dir: String, ) -> Instance { let id = Uuid::new_v4(); let propolis_id = Uuid::new_v4(); @@ -1773,6 +1774,7 @@ mod tests { logctx, storage_handle, nexus_client_with_resolver, + temp_dir, ); let metadata = InstanceMetadata { @@ -1842,6 +1844,7 @@ mod tests { logctx: &LogContext, storage_handle: StorageHandle, nexus_client_with_resolver: NexusClientWithResolver, + temp_dir: String, ) -> InstanceManagerServices { let vnic_allocator = VnicAllocator::new("Foo", Etherstub("mystub".to_string())); @@ -1863,7 +1866,7 @@ mod tests { port_manager, storage: storage_handle, zone_bundler, - zone_builder_factory: ZoneBuilderFactory::fake(None), + zone_builder_factory: ZoneBuilderFactory::fake(Some(temp_dir)), } } @@ -1894,6 +1897,9 @@ mod tests { let storage_handle = fake_storage_manager_with_u2().await; + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + let inst = timeout( TIMEOUT_DURATION, instance_struct( @@ -1901,6 +1907,7 @@ mod tests { propolis_addr, nexus_client_with_resolver, storage_handle, + temp_dir, ), ) .await @@ -1958,6 +1965,9 @@ mod tests { let storage_handle = fake_storage_manager_with_u2().await; + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + let inst = timeout( TIMEOUT_DURATION, instance_struct( @@ -1966,6 +1976,7 @@ mod tests { SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), nexus_client_with_resolver, storage_handle, + temp_dir, ), ) .await @@ -2025,6 +2036,9 @@ mod tests { let storage_handle = fake_storage_manager_with_u2().await; + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + let inst = timeout( TIMEOUT_DURATION, instance_struct( @@ -2033,6 +2047,7 @@ mod tests { SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), nexus_client_with_resolver, storage_handle, + temp_dir, ), ) .await @@ -2082,6 +2097,9 @@ mod tests { let nexus_client_with_resolver = NexusClientWithResolver::new_with_client(nexus_client, resolver); + let temp_guard = Utf8TempDir::new().unwrap(); + let temp_dir = temp_guard.path().to_string(); + let InstanceManagerServices { nexus_client, vnic_allocator: _, @@ -2093,6 +2111,7 @@ mod tests { &logctx, storage_handle, nexus_client_with_resolver, + temp_dir, ); let etherstub = Etherstub("mystub".to_string()); From d55db5b1812b43a7b5ac1f4b25f4f941e047875a Mon Sep 17 00:00:00 2001 From: lif <> Date: Wed, 13 Mar 2024 15:14:07 -0700 Subject: [PATCH 6/8] fixup --- sled-agent/src/instance.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 8e63087e46a..340edff511a 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1760,7 +1760,7 @@ mod tests { propolis_addr: SocketAddr, nexus_client_with_resolver: NexusClientWithResolver, storage_handle: StorageHandle, - temp_dir: String, + temp_dir: &String, ) -> Instance { let id = Uuid::new_v4(); let propolis_id = Uuid::new_v4(); @@ -1844,7 +1844,7 @@ mod tests { logctx: &LogContext, storage_handle: StorageHandle, nexus_client_with_resolver: NexusClientWithResolver, - temp_dir: String, + temp_dir: &String, ) -> InstanceManagerServices { let vnic_allocator = VnicAllocator::new("Foo", Etherstub("mystub".to_string())); @@ -1907,7 +1907,7 @@ mod tests { propolis_addr, nexus_client_with_resolver, storage_handle, - temp_dir, + &temp_dir, ), ) .await @@ -1976,7 +1976,7 @@ mod tests { SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), nexus_client_with_resolver, storage_handle, - temp_dir, + &temp_dir, ), ) .await @@ -2047,7 +2047,7 @@ mod tests { SocketAddr::V6(SocketAddrV6::new(Ipv6Addr::LOCALHOST, 1, 0, 0)), nexus_client_with_resolver, storage_handle, - temp_dir, + &temp_dir, ), ) .await @@ -2111,7 +2111,7 @@ mod tests { &logctx, storage_handle, nexus_client_with_resolver, - temp_dir, + &temp_dir, ); let etherstub = Etherstub("mystub".to_string()); From aa9da2ccab805db63888e404b9c8c5b34ecfbbb5 Mon Sep 17 00:00:00 2001 From: lif <> Date: Wed, 13 Mar 2024 17:05:55 -0700 Subject: [PATCH 7/8] cleanup fmt stable oddity --- sled-agent/src/instance.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 340edff511a..00570c19ecd 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -415,9 +415,7 @@ impl InstanceRunner { }, Some(PutState{ state, tx }) => { tx.send(self.put_state(state).await - .map(|r| InstancePutStateResponse { - updated_runtime: Some(r), - }) + .map(|r| InstancePutStateResponse { updated_runtime: Some(r) }) .map_err(|e| e.into())) .map_err(|_| Error::FailedSendClientClosed) }, From d5da2d4e86867ead95a548937f01eb405d07fab9 Mon Sep 17 00:00:00 2001 From: lif <> Date: Sat, 16 Mar 2024 03:43:11 -0700 Subject: [PATCH 8/8] use tokio::time::advance --- sled-agent/src/instance.rs | 43 ++++++++++++++++++++++++------ sled-agent/src/instance_manager.rs | 6 +++-- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 00570c19ecd..8b6699c8ee1 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -1560,7 +1560,7 @@ mod tests { use tokio::time::timeout; const TIMEOUT_DURATION: tokio::time::Duration = - tokio::time::Duration::from_secs(3); + tokio::time::Duration::from_secs(30); #[derive(Default, Clone)] enum ReceivedInstanceState { @@ -1748,7 +1748,7 @@ mod tests { tokio::spawn(storage_manager.run()); let external_zpool_name = ZpoolName::new_external(Uuid::new_v4()); let external_disk: RawDisk = - SyntheticDisk::new(external_zpool_name).into(); + SyntheticDisk::new(external_zpool_name, 0).into(); storage_handle.upsert_disk(external_disk).await; storage_handle } @@ -1913,10 +1913,15 @@ mod tests { let (put_tx, put_rx) = oneshot::channel(); + // pretending we're InstanceManager::ensure_state, start our "instance" + // (backed by fakes and propolis_mock_server) inst.put_state(put_tx, InstanceStateRequested::Running) .await .expect("failed to send Instance::put_state"); + // even though we ignore this result at instance creation time in + // practice (to avoid request timeouts), in this test let's make sure + // it actually completes. timeout(TIMEOUT_DURATION, put_rx) .await .expect("timed out waiting for Instance::put_state result") @@ -1982,11 +1987,21 @@ mod tests { let (put_tx, put_rx) = oneshot::channel(); + tokio::time::pause(); + + // pretending we're InstanceManager::ensure_state, try in vain to start + // our "instance", but no propolis server is running inst.put_state(put_tx, InstanceStateRequested::Running) .await .expect("failed to send Instance::put_state"); - timeout(TIMEOUT_DURATION, put_rx) + let timeout_fut = timeout(TIMEOUT_DURATION, put_rx); + + tokio::time::advance(TIMEOUT_DURATION).await; + + tokio::time::resume(); + + timeout_fut .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); @@ -2010,16 +2025,18 @@ mod tests { // automock'd things used during this test let _mock_vnic_contexts = mock_vnic_contexts(); + let rt_handle = tokio::runtime::Handle::current(); + // time out while booting zone, on purpose! let boot_ctx = MockZones::boot_context(); - boot_ctx.expect().return_once(|_| { - std::thread::sleep(TIMEOUT_DURATION * 2); + boot_ctx.expect().return_once(move |_| { + rt_handle.block_on(tokio::time::sleep(TIMEOUT_DURATION * 2)); Ok(()) }); let wait_ctx = illumos_utils::svc::wait_for_service_context(); - wait_ctx.expect().times(1..).returning(|_, _, _| Ok(())); + wait_ctx.expect().times(..).returning(|_, _, _| Ok(())); let zone_id_ctx = MockZones::id_context(); - zone_id_ctx.expect().times(1..).returning(|_| Ok(Some(1))); + zone_id_ctx.expect().times(..).returning(|_| Ok(Some(1))); let FakeNexusParts { nexus_client, nexus_server, state_rx } = FakeNexusParts::new(&logctx); @@ -2051,13 +2068,23 @@ mod tests { .await .expect("timed out creating Instance struct"); + tokio::time::pause(); + let (put_tx, put_rx) = oneshot::channel(); + // pretending we're InstanceManager::ensure_state, try in vain to start + // our "instance", but the zone never finishes installing inst.put_state(put_tx, InstanceStateRequested::Running) .await .expect("failed to send Instance::put_state"); - timeout(TIMEOUT_DURATION, put_rx) + let timeout_fut = timeout(TIMEOUT_DURATION, put_rx); + + tokio::time::advance(TIMEOUT_DURATION * 2).await; + + tokio::time::resume(); + + timeout_fut .await .expect_err("*should've* timed out waiting for Instance::put_state, but didn't?"); diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 47c9dfcb58c..2c9780b3ce6 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ -185,7 +185,6 @@ impl InstanceManager { target: InstanceStateRequested, ) -> Result { let (tx, rx) = oneshot::channel(); - self.inner .tx .send(InstanceManagerRequest::EnsureState { @@ -202,7 +201,10 @@ impl InstanceManager { // (see InstanceRunner::put_state) InstanceStateRequested::MigrationTarget(_) | InstanceStateRequested::Running => { - // don't error on channel being closed + // We don't want the sending side of the channel to see an + // error if we drop rx without awaiting it. + // Since we don't care about the response here, we spawn rx + // into a task which will await it for us in the background. tokio::spawn(rx); Ok(InstancePutStateResponse { updated_runtime: None }) }