diff --git a/.github/buildomat/jobs/tuf-repo.sh b/.github/buildomat/jobs/tuf-repo.sh index 60c844cea06..fb5457c0a25 100644 --- a/.github/buildomat/jobs/tuf-repo.sh +++ b/.github/buildomat/jobs/tuf-repo.sh @@ -137,15 +137,15 @@ done # Fetch SP images from oxidecomputer/hubris GHA artifacts. HUBRIS_VERSION="1.0.0-alpha+git${HUBRIS_COMMIT:0:11}" -run_id=$(curl --netrc "https://api.github.com/repos/oxidecomputer/hubris/actions/runs?head_sha=$HUBRIS_COMMIT" \ +run_id=$(curl --netrc -fsS "https://api.github.com/repos/oxidecomputer/hubris/actions/runs?head_sha=$HUBRIS_COMMIT" \ | /opt/ooce/bin/jq -r '.workflow_runs[] | select(.path == ".github/workflows/dist.yml") | .id') -artifacts=$(curl --netrc "https://api.github.com/repos/oxidecomputer/hubris/actions/runs/$run_id/artifacts") +artifacts=$(curl --netrc -fsS "https://api.github.com/repos/oxidecomputer/hubris/actions/runs/$run_id/artifacts") for noun in gimlet-c psc-b sidecar-b; do tufaceous_kind=${noun%-?} tufaceous_kind=${tufaceous_kind//sidecar/switch}_sp job_name=dist-ubuntu-latest-$noun url=$(/opt/ooce/bin/jq --arg name "$job_name" -r '.artifacts[] | select(.name == $name) | .archive_download_url' <<<"$artifacts") - curl -L -o /work/$job_name.zip "$url" + curl --netrc -fsSL -o /work/$job_name.zip "$url" cat >>/work/manifest.toml <` - we parse the prefix /// when reading the structure, and validate that the UUID can be utilized. -#[derive(Clone, Debug, Hash, PartialEq, Eq, JsonSchema)] +#[derive( + Clone, Debug, Hash, PartialEq, Eq, JsonSchema, Serialize, Deserialize, +)] pub struct ZpoolName { id: Uuid, kind: ZpoolKind, @@ -323,25 +327,6 @@ impl ZpoolName { } } -impl<'de> Deserialize<'de> for ZpoolName { - fn deserialize(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let s = String::deserialize(deserializer)?; - ZpoolName::from_str(&s).map_err(serde::de::Error::custom) - } -} - -impl Serialize for ZpoolName { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - serializer.serialize_str(&self.to_string()) - } -} - impl FromStr for ZpoolName { type Err = String; @@ -374,60 +359,15 @@ impl fmt::Display for ZpoolName { mod test { use super::*; - fn toml_string(s: &str) -> String { - format!("zpool_name = \"{}\"", s) - } - - fn parse_name(s: &str) -> Result { - toml_string(s) - .parse::() - .expect("Cannot parse as TOML value") - .get("zpool_name") - .expect("Missing key") - .clone() - .try_into::() - } - - #[test] - fn test_parse_external_zpool_name() { - let uuid: Uuid = - "d462a7f7-b628-40fe-80ff-4e4189e2d62b".parse().unwrap(); - let good_name = format!("{}{}", ZPOOL_EXTERNAL_PREFIX, uuid); - - let name = parse_name(&good_name).expect("Cannot parse as ZpoolName"); - assert_eq!(uuid, name.id()); - assert_eq!(ZpoolKind::External, name.kind()); - } - #[test] - fn test_parse_internal_zpool_name() { - let uuid: Uuid = - "d462a7f7-b628-40fe-80ff-4e4189e2d62b".parse().unwrap(); - let good_name = format!("{}{}", ZPOOL_INTERNAL_PREFIX, uuid); - - let name = parse_name(&good_name).expect("Cannot parse as ZpoolName"); - assert_eq!(uuid, name.id()); - assert_eq!(ZpoolKind::Internal, name.kind()); - } - - #[test] - fn test_parse_bad_zpool_names() { - let bad_names = vec![ - // Nonsense string - "this string is GARBAGE", - // Missing prefix - "d462a7f7-b628-40fe-80ff-4e4189e2d62b", - // Underscores - "oxp_d462a7f7_b628_40fe_80ff_4e4189e2d62b", - ]; - - for bad_name in &bad_names { - assert!( - parse_name(&bad_name).is_err(), - "Parsing {} should fail", - bad_name - ); - } + fn test_parse_zpool_name_json() { + let _zpool_name: ZpoolName = serde_json::from_str( + r#"{ + "id": "d462a7f7-b628-40fe-80ff-4e4189e2d62b", + "kind": "external" + }"#, + ) + .expect("Could not parse ZpoolName from Json Object"); } #[test] diff --git a/nexus/db-model/src/service_kind.rs b/nexus/db-model/src/service_kind.rs index 3be06c12057..afb29abaa79 100644 --- a/nexus/db-model/src/service_kind.rs +++ b/nexus/db-model/src/service_kind.rs @@ -17,6 +17,9 @@ impl_enum_type!( pub enum ServiceKind; // Enum values + Clickhouse => b"clickhouse" + Cockroach => b"cockroach" + Crucible => b"crucible" CruciblePantry => b"crucible_pantry" Dendrite => b"dendrite" ExternalDns => b"external_dns" @@ -48,6 +51,15 @@ impl From for ServiceKind { impl From for ServiceKind { fn from(k: internal_api::params::ServiceKind) -> Self { match k { + internal_api::params::ServiceKind::Clickhouse => { + ServiceKind::Clickhouse + } + internal_api::params::ServiceKind::Cockroach => { + ServiceKind::Cockroach + } + internal_api::params::ServiceKind::Crucible => { + ServiceKind::Crucible + } internal_api::params::ServiceKind::ExternalDns { .. } => { ServiceKind::ExternalDns } diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 7ec16a70e9c..4c76689cffc 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -74,6 +74,11 @@ impl OldRegions { } /// A subquery to find datasets which could be used for provisioning regions. +/// +/// We only consider datasets which are already allocated as "Crucible". +/// This implicitly distinguishes between "M.2s" and "U.2s" -- Nexus needs to +/// determine during dataset provisioning which devices should be considered for +/// usage as Crucible storage. #[derive(Subquery, QueryId)] #[subquery(name = candidate_datasets)] struct CandidateDatasets { @@ -214,13 +219,14 @@ struct OldPoolUsage { } impl OldPoolUsage { - fn new() -> Self { + fn new(candidate_zpools: &CandidateZpools) -> Self { use crate::db::schema::dataset::dsl as dataset_dsl; Self { query: Box::new( dataset_dsl::dataset .inner_join( - candidate_zpools::dsl::candidate_zpools + candidate_zpools + .query_source() .on(dataset_dsl::pool_id .eq(candidate_zpools::dsl::id)), ) @@ -473,7 +479,7 @@ impl RegionAllocate { extent_count, ); let proposed_changes = ProposedChanges::new(&candidate_regions); - let old_pool_usage = OldPoolUsage::new(); + let old_pool_usage = OldPoolUsage::new(&candidate_zpools); let zpool_size_delta = ZpoolSizeDelta::new(&proposed_changes); let proposed_datasets_fit = ProposedDatasetsFit::new(&old_pool_usage, &zpool_size_delta); diff --git a/nexus/types/src/internal_api/params.rs b/nexus/types/src/internal_api/params.rs index a062b5a9870..9b3a5afecbd 100644 --- a/nexus/types/src/internal_api/params.rs +++ b/nexus/types/src/internal_api/params.rs @@ -175,13 +175,16 @@ pub struct ServiceNic { #[derive(Debug, Serialize, Deserialize, JsonSchema, Clone, PartialEq, Eq)] #[serde(rename_all = "snake_case", tag = "type", content = "content")] pub enum ServiceKind { + Clickhouse, + Cockroach, + Crucible, + CruciblePantry, ExternalDns { external_address: IpAddr, nic: ServiceNic }, InternalDns, Nexus { external_address: IpAddr, nic: ServiceNic }, Oximeter, Dendrite, Tfport, - CruciblePantry, BoundaryNtp { snat: SourceNatConfig, nic: ServiceNic }, InternalNtp, } @@ -190,6 +193,9 @@ impl fmt::Display for ServiceKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use ServiceKind::*; let s = match self { + Clickhouse => "clickhouse", + Cockroach => "cockroach", + Crucible => "crucible", ExternalDns { .. } => "external_dns", InternalDns => "internal_dns", Nexus { .. } => "nexus", diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index 67155bff979..a8f414cf30f 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -2776,6 +2776,62 @@ "ServiceKind": { "description": "Describes the purpose of the service.", "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "clickhouse" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "cockroach" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "crucible" + ] + } + }, + "required": [ + "type" + ] + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": [ + "crucible_pantry" + ] + } + }, + "required": [ + "type" + ] + }, { "type": "object", "properties": { @@ -2894,20 +2950,6 @@ "type" ] }, - { - "type": "object", - "properties": { - "type": { - "type": "string", - "enum": [ - "crucible_pantry" - ] - } - }, - "required": [ - "type" - ] - }, { "type": "object", "properties": { diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index f70873f5dd9..de323de0475 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -54,32 +54,6 @@ } } }, - "/filesystem": { - "put": { - "operationId": "filesystems_put", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/DatasetEnsureBody" - } - } - }, - "required": true - }, - "responses": { - "204": { - "description": "resource updated" - }, - "4XX": { - "$ref": "#/components/responses/Error" - }, - "5XX": { - "$ref": "#/components/responses/Error" - } - } - } - }, "/instances/{instance_id}": { "put": { "operationId": "instance_register", @@ -607,48 +581,6 @@ "target" ] }, - "DatasetEnsureBody": { - "description": "Used to request that the Sled initialize multiple datasets.", - "type": "object", - "properties": { - "datasets": { - "type": "array", - "items": { - "$ref": "#/components/schemas/DatasetEnsureRequest" - } - } - }, - "required": [ - "datasets" - ] - }, - "DatasetEnsureRequest": { - "description": "Used to request a new dataset kind exists within a zpool.\n\nMany dataset types are associated with services that will be instantiated when the dataset is detected.", - "type": "object", - "properties": { - "address": { - "type": "string" - }, - "dataset_name": { - "$ref": "#/components/schemas/DatasetName" - }, - "gz_address": { - "nullable": true, - "default": null, - "type": "string", - "format": "ipv6" - }, - "id": { - "type": "string", - "format": "uuid" - } - }, - "required": [ - "address", - "dataset_name", - "id" - ] - }, "DatasetKind": { "description": "The type of a dataset, and an auxiliary information necessary to successfully launch a zone managing the associated data.", "oneOf": [ @@ -697,22 +629,6 @@ { "type": "object", "properties": { - "dns_address": { - "description": "The address at which the external DNS server is reachable.", - "type": "string" - }, - "http_address": { - "description": "The address at which the external DNS server API is reachable.", - "type": "string" - }, - "nic": { - "description": "The service vNIC providing external connectivity using OPTE.", - "allOf": [ - { - "$ref": "#/components/schemas/NetworkInterface" - } - ] - }, "type": { "type": "string", "enum": [ @@ -721,23 +637,12 @@ } }, "required": [ - "dns_address", - "http_address", - "nic", "type" ] }, { "type": "object", "properties": { - "dns_address": { - "description": "The address at which the internal DNS server is reachable.", - "type": "string" - }, - "http_address": { - "description": "The address at which the internal DNS server API is reachable.", - "type": "string" - }, "type": { "type": "string", "enum": [ @@ -746,8 +651,6 @@ } }, "required": [ - "dns_address", - "http_address", "type" ] } @@ -768,6 +671,23 @@ "pool_name" ] }, + "DatasetRequest": { + "description": "Describes a request to provision a specific dataset", + "type": "object", + "properties": { + "id": { + "type": "string", + "format": "uuid" + }, + "name": { + "$ref": "#/components/schemas/DatasetName" + } + }, + "required": [ + "id", + "name" + ] + }, "DiskEnsureBody": { "description": "Sent from to a sled agent to establish the runtime state of a Disk", "type": "object", @@ -1809,7 +1729,7 @@ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" }, "ServiceEnsureBody": { - "description": "Used to request that the Sled initialize multiple services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.", + "description": "Used to request that the Sled initialize multiple services.", "type": "object", "properties": { "services": { @@ -2091,7 +2011,7 @@ "default": null, "allOf": [ { - "$ref": "#/components/schemas/DatasetName" + "$ref": "#/components/schemas/DatasetRequest" } ] }, diff --git a/openapi/wicketd.json b/openapi/wicketd.json index a0bdafc1a8e..86067d803bf 100644 --- a/openapi/wicketd.json +++ b/openapi/wicketd.json @@ -107,6 +107,30 @@ } } }, + "/bootstrap-sleds": { + "get": { + "summary": "Get wicketd's current view of all sleds visible on the bootstrap network.", + "operationId": "get_bootstrap_sleds", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/BootstrapSledIps" + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/clear-update-state/{type}/{slot}": { "post": { "summary": "Resets update state for a sled.", @@ -664,26 +688,52 @@ "BootstrapSledDescription": { "type": "object", "properties": { - "id": { - "$ref": "#/components/schemas/SpIdentifier" + "baseboard": { + "$ref": "#/components/schemas/Baseboard" }, - "model": { - "type": "string" + "bootstrap_ip": { + "nullable": true, + "description": "The sled's bootstrap address, if the host is on and we've discovered it on the bootstrap network.", + "type": "string", + "format": "ipv6" }, - "revision": { - "type": "integer", - "format": "uint32", - "minimum": 0 + "id": { + "$ref": "#/components/schemas/SpIdentifier" + } + }, + "required": [ + "baseboard", + "id" + ] + }, + "BootstrapSledIp": { + "type": "object", + "properties": { + "baseboard": { + "$ref": "#/components/schemas/Baseboard" }, - "serial_number": { - "type": "string" + "ip": { + "type": "string", + "format": "ipv6" } }, "required": [ - "id", - "model", - "revision", - "serial_number" + "baseboard", + "ip" + ] + }, + "BootstrapSledIps": { + "type": "object", + "properties": { + "sleds": { + "type": "array", + "items": { + "$ref": "#/components/schemas/BootstrapSledIp" + } + } + }, + "required": [ + "sleds" ] }, "CertificateUploadResponse": { diff --git a/package-manifest.toml b/package-manifest.toml index 8826c18db1c..4a06e6d973c 100644 --- a/package-manifest.toml +++ b/package-manifest.toml @@ -300,8 +300,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9e3764239515a7bb3f09c922cfb88b1be3dade77" -source.sha256 = "13d8ff8374ec0b5d9b681b83aef8a2d4f0aed15d0ad92dc5b04a43d850196309" +source.commit = "3857dac89bf16851df170db2fe3797cec3c1b711" +source.sha256 = "e42742c6d253f99ef280341b6c2c3ab8658dec0595a2ebf65b6f6618e4d34b6a" output.type = "zone" output.intermediate_only = true @@ -325,8 +325,8 @@ only_for_targets.image = "standard" # 2. Copy the output zone image from dendrite/out to omicron/out source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9e3764239515a7bb3f09c922cfb88b1be3dade77" -source.sha256 = "c90dedceec367f5fc8f2c585fda4be1804f59124c067dfc3632cfb7a938b2b4f" +source.commit = "3857dac89bf16851df170db2fe3797cec3c1b711" +source.sha256 = "3a4238cda6c408133c2f4699f2f6575701dc8551199b83b3b3dbd8f0d0b1a512" output.type = "zone" output.intermediate_only = true @@ -343,8 +343,8 @@ only_for_targets.image = "standard" # 2. Copy dendrite.tar.gz from dendrite/out to omicron/out/dendrite-softnpu.tar.gz source.type = "prebuilt" source.repo = "dendrite" -source.commit = "9e3764239515a7bb3f09c922cfb88b1be3dade77" -source.sha256 = "60633b939e9492c6e5302d9ea4c373ffa7d652be1ca7b1f43524f4da07cf14c7" +source.commit = "3857dac89bf16851df170db2fe3797cec3c1b711" +source.sha256 = "dcaab63d5bc1a786eb0bd2868a14a0e72ede879867fc49c0c368861c9e4f9f1f" output.type = "zone" output.intermediate_only = true diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index accf6072541..ac83db25812 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -5,10 +5,9 @@ //! HTTP entrypoint functions for the sled agent's exposed API use crate::params::{ - DatasetEnsureBody, DiskEnsureBody, InstanceEnsureBody, - InstancePutMigrationIdsBody, InstancePutStateBody, - InstancePutStateResponse, InstanceUnregisterResponse, ServiceEnsureBody, - SledRole, TimeSync, VpcFirewallRulesEnsureBody, Zpool, + DiskEnsureBody, InstanceEnsureBody, InstancePutMigrationIdsBody, + InstancePutStateBody, InstancePutStateResponse, InstanceUnregisterResponse, + ServiceEnsureBody, SledRole, TimeSync, VpcFirewallRulesEnsureBody, Zpool, }; use dropshot::{ endpoint, ApiDescription, HttpError, HttpResponseOk, @@ -31,7 +30,6 @@ type SledApiDescription = ApiDescription; pub fn api() -> SledApiDescription { fn register_endpoints(api: &mut SledApiDescription) -> Result<(), String> { api.register(disk_put)?; - api.register(filesystems_put)?; api.register(instance_issue_disk_snapshot_request)?; api.register(instance_put_migration_ids)?; api.register(instance_put_state)?; @@ -109,20 +107,6 @@ async fn sled_role_get( Ok(HttpResponseOk(sa.get_role().await)) } -#[endpoint { - method = PUT, - path = "/filesystem", -}] -async fn filesystems_put( - rqctx: RequestContext, - body: TypedBody, -) -> Result { - let sa = rqctx.context(); - let body_args = body.into_inner(); - sa.filesystems_ensure(body_args).await.map_err(|e| Error::from(e))?; - Ok(HttpResponseUpdatedNoContent()) -} - /// Path parameters for Instance requests (sled agent API) #[derive(Deserialize, JsonSchema)] struct InstancePathParam { diff --git a/sled-agent/src/params.rs b/sled-agent/src/params.rs index 38e2513d477..787a7329bba 100644 --- a/sled-agent/src/params.rs +++ b/sled-agent/src/params.rs @@ -216,52 +216,8 @@ pub enum DatasetKind { CockroachDb, Crucible, Clickhouse, - ExternalDns { - /// The address at which the external DNS server API is reachable. - http_address: SocketAddrV6, - /// The address at which the external DNS server is reachable. - dns_address: SocketAddr, - /// The service vNIC providing external connectivity using OPTE. - nic: NetworkInterface, - }, - InternalDns { - /// The address at which the internal DNS server API is reachable. - http_address: SocketAddrV6, - /// The address at which the internal DNS server is reachable. - dns_address: SocketAddrV6, - }, -} - -impl DatasetKind { - /// Returns the type of the zone which manages this dataset. - pub fn zone_type(&self) -> ZoneType { - match *self { - DatasetKind::CockroachDb => ZoneType::CockroachDb, - DatasetKind::Crucible => ZoneType::Crucible, - DatasetKind::Clickhouse => ZoneType::Clickhouse, - DatasetKind::ExternalDns { .. } => ZoneType::ExternalDns, - DatasetKind::InternalDns { .. } => ZoneType::InternalDns, - } - } - - /// Returns the service type which runs in the zone managing this dataset. - /// - /// NOTE: This interface is only viable because datasets run a single - /// service in their zone. If that precondition is no longer true, this - /// interface should be re-visited. - pub fn service_type(&self) -> ServiceType { - match self.clone() { - DatasetKind::CockroachDb => ServiceType::CockroachDb, - DatasetKind::Crucible => ServiceType::Crucible, - DatasetKind::Clickhouse => ServiceType::Clickhouse, - DatasetKind::ExternalDns { http_address, dns_address, nic } => { - ServiceType::ExternalDns { http_address, dns_address, nic } - } - DatasetKind::InternalDns { http_address, dns_address } => { - ServiceType::InternalDns { http_address, dns_address } - } - } - } + ExternalDns, + InternalDns, } impl From for sled_agent_client::types::DatasetKind { @@ -271,17 +227,8 @@ impl From for sled_agent_client::types::DatasetKind { CockroachDb => Self::CockroachDb, Crucible => Self::Crucible, Clickhouse => Self::Clickhouse, - ExternalDns { http_address, dns_address, nic } => { - Self::ExternalDns { - http_address: http_address.to_string(), - dns_address: dns_address.to_string(), - nic: nic.into(), - } - } - InternalDns { http_address, dns_address } => Self::InternalDns { - http_address: http_address.to_string(), - dns_address: dns_address.to_string(), - }, + ExternalDns => Self::ExternalDns, + InternalDns => Self::InternalDns, } } } @@ -290,11 +237,11 @@ impl From for nexus_client::types::DatasetKind { fn from(k: DatasetKind) -> Self { use DatasetKind::*; match k { - CockroachDb { .. } => Self::Cockroach, + CockroachDb => Self::Cockroach, Crucible => Self::Crucible, Clickhouse => Self::Clickhouse, - ExternalDns { .. } => Self::ExternalDns, - InternalDns { .. } => Self::InternalDns, + ExternalDns => Self::ExternalDns, + InternalDns => Self::InternalDns, } } } @@ -313,43 +260,6 @@ impl std::fmt::Display for DatasetKind { } } -/// Used to request that the Sled initialize multiple datasets. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] -pub struct DatasetEnsureBody { - pub datasets: Vec, -} - -/// Used to request a new dataset kind exists within a zpool. -/// -/// Many dataset types are associated with services that will be -/// instantiated when the dataset is detected. -#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] -pub struct DatasetEnsureRequest { - // The UUID of the dataset, as well as the service using it directly. - pub id: Uuid, - // The name (and UUID) of the Zpool which we are inserting into. - pub dataset_name: crate::storage::dataset::DatasetName, - // The address on which the zone will listen for requests. - pub address: SocketAddrV6, - // The addresses in the global zone which should be created, if necessary - // to route to the service. - #[serde(default)] - pub gz_address: Option, -} - -impl From - for sled_agent_client::types::DatasetEnsureRequest -{ - fn from(p: DatasetEnsureRequest) -> Self { - Self { - id: p.id, - dataset_name: p.dataset_name.into(), - address: p.address.to_string(), - gz_address: p.gz_address, - } - } -} - /// Describes service-specific parameters. #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, @@ -585,6 +495,21 @@ impl std::fmt::Display for ZoneType { } } +/// Describes a request to provision a specific dataset +#[derive( + Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, +)] +pub struct DatasetRequest { + pub id: Uuid, + pub name: crate::storage::dataset::DatasetName, +} + +impl From for sled_agent_client::types::DatasetRequest { + fn from(d: DatasetRequest) -> Self { + Self { id: d.id, name: d.name.into() } + } +} + /// Describes a request to create a zone running one or more services. #[derive( Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash, @@ -598,7 +523,7 @@ pub struct ServiceZoneRequest { pub addresses: Vec, // Datasets which should be managed by this service. #[serde(default)] - pub dataset: Option, + pub dataset: Option, // The addresses in the global zone which should be created, if necessary // to route to the service. // @@ -623,7 +548,7 @@ impl ServiceZoneRequest { // The name of a unique identifier for the zone, if one is necessary. pub fn zone_name_unique_identifier(&self) -> Option { - self.dataset.as_ref().map(|d| d.pool().to_string()) + self.dataset.as_ref().map(|d| d.name.pool().to_string()) } } @@ -670,10 +595,6 @@ impl TryFrom } /// Used to request that the Sled initialize multiple services. -/// -/// This may be used to record that certain sleds are responsible for -/// launching services which may not be associated with a dataset, such -/// as Nexus. #[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] pub struct ServiceEnsureBody { pub services: Vec, diff --git a/sled-agent/src/rack_setup/plan/service.rs b/sled-agent/src/rack_setup/plan/service.rs index ec10ce0bd78..6bbe29f179c 100644 --- a/sled-agent/src/rack_setup/plan/service.rs +++ b/sled-agent/src/rack_setup/plan/service.rs @@ -7,7 +7,7 @@ use crate::bootstrap::params::StartSledAgentRequest; use crate::ledger::{Ledger, Ledgerable}; use crate::params::{ - DatasetEnsureRequest, ServiceType, ServiceZoneRequest, ServiceZoneService, + DatasetRequest, ServiceType, ServiceZoneRequest, ServiceZoneService, ZoneType, }; use crate::rack_setup::config::SetupServiceConfig as Config; @@ -93,10 +93,6 @@ pub enum PlanError { #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq)] pub struct SledRequest { - /// Datasets to be created. - #[serde(default, rename = "dataset")] - pub datasets: Vec, - /// Services to be instantiated. #[serde(default, rename = "service")] pub services: Vec, @@ -303,25 +299,15 @@ impl Plan { svc_port_builder.next_dns(id, &mut services_ip_pool)?; let dns_port = omicron_common::address::DNS_PORT; let dns_address = SocketAddr::new(external_ip, dns_port); - let dataset_kind = crate::params::DatasetKind::ExternalDns { - http_address, - dns_address, - nic: nic.clone(), - }; + let dataset_kind = crate::params::DatasetKind::ExternalDns; let dataset_name = DatasetName::new(u2_zpools[0].clone(), dataset_kind); - request.datasets.push(DatasetEnsureRequest { - id, - dataset_name: dataset_name.clone(), - address: http_address, - gz_address: None, - }); request.services.push(ServiceZoneRequest { id, zone_type: ZoneType::ExternalDns, - addresses: vec![internal_ip], - dataset: Some(dataset_name), + addresses: vec![*http_address.ip()], + dataset: Some(DatasetRequest { id, name: dataset_name }), gz_addresses: vec![], services: vec![ServiceZoneService { id, @@ -406,19 +392,26 @@ impl Plan { let id = Uuid::new_v4(); let ip = addr_alloc.next().expect("Not enough addrs"); let port = omicron_common::address::COCKROACH_PORT; - let address = SocketAddrV6::new(ip, port, 0, 0); let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder .service_backend_zone(ServiceName::Cockroach, &zone, port) .unwrap(); - request.datasets.push(DatasetEnsureRequest { + request.services.push(ServiceZoneRequest { id, - dataset_name: DatasetName::new( - u2_zpools[0].clone(), - crate::params::DatasetKind::CockroachDb, - ), - address, - gz_address: None, + zone_type: ZoneType::CockroachDb, + addresses: vec![ip], + dataset: Some(DatasetRequest { + id, + name: DatasetName::new( + u2_zpools[0].clone(), + crate::params::DatasetKind::CockroachDb, + ), + }), + gz_addresses: vec![], + services: vec![ServiceZoneService { + id, + details: ServiceType::CockroachDb, + }], }); } @@ -427,19 +420,26 @@ impl Plan { let id = Uuid::new_v4(); let ip = addr_alloc.next().expect("Not enough addrs"); let port = omicron_common::address::CLICKHOUSE_PORT; - let address = SocketAddrV6::new(ip, port, 0, 0); let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder .service_backend_zone(ServiceName::Clickhouse, &zone, port) .unwrap(); - request.datasets.push(DatasetEnsureRequest { + request.services.push(ServiceZoneRequest { id, - dataset_name: DatasetName::new( - u2_zpools[0].clone(), - crate::params::DatasetKind::Clickhouse, - ), - address, - gz_address: None, + zone_type: ZoneType::Clickhouse, + addresses: vec![ip], + dataset: Some(DatasetRequest { + id, + name: DatasetName::new( + u2_zpools[0].clone(), + crate::params::DatasetKind::Clickhouse, + ), + }), + gz_addresses: vec![], + services: vec![ServiceZoneService { + id, + details: ServiceType::Clickhouse, + }], }); } @@ -449,7 +449,6 @@ impl Plan { for pool in &u2_zpools { let ip = addr_alloc.next().expect("Not enough addrs"); let port = omicron_common::address::CRUCIBLE_PORT; - let address = SocketAddrV6::new(ip, port, 0, 0); let id = Uuid::new_v4(); let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder @@ -460,14 +459,22 @@ impl Plan { ) .unwrap(); - request.datasets.push(DatasetEnsureRequest { + request.services.push(ServiceZoneRequest { id, - dataset_name: DatasetName::new( - pool.clone(), - crate::params::DatasetKind::Crucible, - ), - address, - gz_address: None, + zone_type: ZoneType::Crucible, + addresses: vec![ip], + dataset: Some(DatasetRequest { + id, + name: DatasetName::new( + pool.clone(), + crate::params::DatasetKind::Crucible, + ), + }), + gz_addresses: vec![], + services: vec![ServiceZoneService { + id, + details: ServiceType::Crucible, + }], }); } @@ -475,20 +482,12 @@ impl Plan { // responsibility of being internal DNS servers. if idx < dns_subnets.len() { let dns_subnet = &dns_subnets[idx]; - let dns_ip = dns_subnet.dns_address().ip(); - let dns_address = SocketAddrV6::new(dns_ip, DNS_PORT, 0, 0); - let http_address = - SocketAddrV6::new(dns_ip, DNS_HTTP_PORT, 0, 0); - let id = Uuid::new_v4(); - let zone = dns_builder.host_zone(id, dns_ip).unwrap(); - let dataset_name = DatasetName::new( - u2_zpools[0].clone(), - crate::params::DatasetKind::InternalDns { - http_address, - dns_address, - }, - ); + let ip = dns_subnet.dns_address().ip(); + let http_address = SocketAddrV6::new(ip, DNS_HTTP_PORT, 0, 0); + let dns_address = SocketAddrV6::new(ip, DNS_PORT, 0, 0); + let id = Uuid::new_v4(); + let zone = dns_builder.host_zone(id, ip).unwrap(); dns_builder .service_backend_zone( ServiceName::InternalDns, @@ -496,11 +495,24 @@ impl Plan { DNS_HTTP_PORT, ) .unwrap(); - request.datasets.push(DatasetEnsureRequest { + let dataset_name = DatasetName::new( + u2_zpools[0].clone(), + crate::params::DatasetKind::InternalDns, + ); + + request.services.push(ServiceZoneRequest { id, - dataset_name, - address: http_address, - gz_address: Some(dns_subnet.gz_address().ip()), + zone_type: ZoneType::InternalDns, + addresses: vec![ip], + dataset: Some(DatasetRequest { id, name: dataset_name }), + gz_addresses: vec![dns_subnet.gz_address().ip()], + services: vec![ServiceZoneService { + id, + details: ServiceType::InternalDns { + http_address, + dns_address, + }, + }], }); } diff --git a/sled-agent/src/rack_setup/service.rs b/sled-agent/src/rack_setup/service.rs index 2cc5adb43d1..4400c810968 100644 --- a/sled-agent/src/rack_setup/service.rs +++ b/sled-agent/src/rack_setup/service.rs @@ -62,8 +62,8 @@ use crate::bootstrap::rss_handle::BootstrapAgentHandle; use crate::ledger::{Ledger, Ledgerable}; use crate::nexus::d2n_params; use crate::params::{ - AutonomousServiceOnlyError, DatasetEnsureRequest, DatasetKind, ServiceType, - ServiceZoneRequest, TimeSync, ZoneType, + AutonomousServiceOnlyError, DatasetKind, ServiceType, ServiceZoneRequest, + ServiceZoneService, TimeSync, ZoneType, }; use crate::rack_setup::plan::service::{ Plan as ServicePlan, PlanError as ServicePlanError, @@ -86,8 +86,9 @@ use nexus_client::{ }; use omicron_common::address::Ipv6Subnet; use omicron_common::address::{ - get_sled_address, CRUCIBLE_PANTRY_PORT, DENDRITE_PORT, NEXUS_INTERNAL_PORT, - NTP_PORT, OXIMETER_PORT, + get_sled_address, CLICKHOUSE_PORT, COCKROACH_PORT, CRUCIBLE_PANTRY_PORT, + CRUCIBLE_PORT, DENDRITE_PORT, DNS_HTTP_PORT, NEXUS_INTERNAL_PORT, NTP_PORT, + OXIMETER_PORT, }; use omicron_common::api::internal::shared::{PortFec, PortSpeed}; use omicron_common::backoff::{ @@ -281,51 +282,6 @@ impl ServiceInner { ServiceInner { log } } - async fn initialize_datasets( - &self, - sled_address: SocketAddrV6, - datasets: &Vec, - ) -> Result<(), SetupServiceError> { - let dur = std::time::Duration::from_secs(60); - - let client = reqwest::ClientBuilder::new() - .connect_timeout(dur) - .timeout(dur) - .build() - .map_err(SetupServiceError::HttpClient)?; - let client = SledAgentClient::new_with_client( - &format!("http://{}", sled_address), - client, - self.log.new(o!("SledAgentClient" => sled_address.to_string())), - ); - - let datasets = - datasets.iter().map(|d| d.clone().into()).collect::>(); - - info!(self.log, "sending dataset requests..."); - let filesystem_put = || async { - info!(self.log, "creating new filesystems: {:?}", datasets); - client - .filesystems_put(&SledAgentTypes::DatasetEnsureBody { - datasets: datasets.clone(), - }) - .await - .map_err(BackoffError::transient)?; - Ok::<(), BackoffError>>(()) - }; - let log_failure = |error, _| { - warn!(self.log, "failed to create filesystem"; "error" => ?error); - }; - retry_notify( - retry_policy_internal_service_aggressive(), - filesystem_put, - log_failure, - ) - .await?; - - Ok(()) - } - async fn initialize_services( &self, sled_address: SocketAddrV6, @@ -384,22 +340,19 @@ impl ServiceInner { // Start up the internal DNS services futures::future::join_all(service_plan.services.iter().map( |(sled_address, services_request)| async move { - let datasets: Vec<_> = services_request - .datasets + let services: Vec<_> = services_request + .services .iter() - .filter_map(|dataset| { - if matches!( - dataset.dataset_name.dataset(), - DatasetKind::InternalDns { .. } - ) { - Some(dataset.clone()) + .filter_map(|service| { + if matches!(service.zone_type, ZoneType::InternalDns,) { + Some(service.clone()) } else { None } }) .collect(); - if !datasets.is_empty() { - self.initialize_datasets(*sled_address, &datasets).await?; + if !services.is_empty() { + self.initialize_services(*sled_address, &services).await?; } Ok(()) }, @@ -416,11 +369,16 @@ impl ServiceInner { |(_, services_request)| { // iterate services for this sled let dns_addrs: Vec = services_request - .datasets + .services .iter() - .filter_map(|dataset| { - match dataset.dataset_name.dataset() { - DatasetKind::InternalDns { http_address, .. } => Some(*http_address), + .filter_map(|service| { + match &service.services[0] { + ServiceZoneService { + details: ServiceType::InternalDns { http_address, .. }, + .. + } => { + Some(*http_address) + }, _ => None, } }) @@ -762,25 +720,89 @@ impl ServiceInner { kind: NexusTypes::ServiceKind::InternalNtp, }); } - details => { + ServiceType::Clickhouse => { + services.push(NexusTypes::ServicePutRequest { + service_id, + zone_id, + sled_id, + address: SocketAddrV6::new( + zone.addresses[0], + CLICKHOUSE_PORT, + 0, + 0, + ) + .to_string(), + kind: NexusTypes::ServiceKind::Clickhouse, + }); + } + ServiceType::Crucible => { + services.push(NexusTypes::ServicePutRequest { + service_id, + zone_id, + sled_id, + address: SocketAddrV6::new( + zone.addresses[0], + CRUCIBLE_PORT, + 0, + 0, + ) + .to_string(), + kind: NexusTypes::ServiceKind::Crucible, + }); + } + ServiceType::CockroachDb => { + services.push(NexusTypes::ServicePutRequest { + service_id, + zone_id, + sled_id, + address: SocketAddrV6::new( + zone.addresses[0], + COCKROACH_PORT, + 0, + 0, + ) + .to_string(), + kind: NexusTypes::ServiceKind::Cockroach, + }); + } + ServiceType::ManagementGatewayService + | ServiceType::Wicketd { .. } + | ServiceType::Maghemite { .. } + | ServiceType::Tfport { .. } => { return Err(SetupServiceError::BadConfig(format!( "RSS should not request service of type: {}", - details + svc.details ))); } } } } - for dataset in service_request.datasets.iter() { - datasets.push(NexusTypes::DatasetCreateRequest { - zpool_id: dataset.dataset_name.pool().id(), - dataset_id: dataset.id, - request: NexusTypes::DatasetPutRequest { - address: dataset.address.to_string(), - kind: dataset.dataset_name.dataset().clone().into(), - }, - }) + for service in service_request.services.iter() { + if let Some(dataset) = &service.dataset { + let port = match dataset.name.dataset() { + DatasetKind::CockroachDb => COCKROACH_PORT, + DatasetKind::Clickhouse => CLICKHOUSE_PORT, + DatasetKind::Crucible => CRUCIBLE_PORT, + DatasetKind::ExternalDns => DNS_HTTP_PORT, + DatasetKind::InternalDns => DNS_HTTP_PORT, + }; + + datasets.push(NexusTypes::DatasetCreateRequest { + zpool_id: dataset.name.pool().id(), + dataset_id: dataset.id, + request: NexusTypes::DatasetPutRequest { + address: SocketAddrV6::new( + service.addresses[0], + port, + 0, + 0, + ) + .to_string(), + kind: dataset.name.dataset().clone().into(), + }, + }) + } } } let internal_services_ip_pool_ranges = config @@ -1186,31 +1208,9 @@ impl ServiceInner { // Wait until time is synchronized on all sleds before proceeding. self.wait_for_timesync(&sled_addresses).await?; - // Issue the dataset initialization requests to all sleds. - futures::future::join_all(service_plan.services.iter().map( - |(sled_address, services_request)| async move { - self.initialize_datasets( - *sled_address, - &services_request.datasets, - ) - .await?; - Ok(()) - }, - )) - .await - .into_iter() - .collect::>()?; - - info!(self.log, "Finished setting up agents and datasets"); + info!(self.log, "Finished setting up Internal DNS and NTP"); // Issue service initialization requests. - // - // NOTE: This must happen *after* the dataset initialization, - // to ensure that CockroachDB has been initialized before Nexus - // starts. - // - // If Nexus was more resilient to concurrent initialization - // of CRDB, this requirement could be relaxed. futures::future::join_all(service_plan.services.iter().map( |(sled_address, services_request)| async move { // With the current implementation of "initialize_services", diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 97cdd930136..55d680dcd27 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -217,7 +217,6 @@ impl Config { // The filename of the ledger, within the provided directory. const SERVICES_LEDGER_FILENAME: &str = "services.toml"; -const STORAGE_SERVICES_LEDGER_FILENAME: &str = "storage-services.toml"; // A wrapper around `ZoneRequest`, which allows it to be serialized // to a toml file. @@ -306,8 +305,6 @@ pub struct ServiceManagerInner { sidecar_revision: SidecarRevision, // Zones representing running services zones: Mutex>, - // Zones representing services which own datasets - dataset_zones: Mutex>, underlay_vnic_allocator: VnicAllocator, underlay_vnic: EtherstubVnic, bootstrap_vnic_allocator: VnicAllocator, @@ -380,7 +377,6 @@ impl ServiceManager { sidecar_revision, switch_zone_maghemite_links, zones: Mutex::new(vec![]), - dataset_zones: Mutex::new(vec![]), underlay_vnic_allocator: VnicAllocator::new( "Service", underlay_etherstub, @@ -430,22 +426,7 @@ impl ServiceManager { .collect() } - async fn all_storage_service_ledgers(&self) -> Vec { - if let Some(dir) = self.inner.ledger_directory_override.get() { - return vec![dir.join(STORAGE_SERVICES_LEDGER_FILENAME)]; - } - - self.inner - .storage - .resources() - .all_m2_mountpoints(sled_hardware::disk::CONFIG_DATASET) - .await - .into_iter() - .map(|p| p.join(STORAGE_SERVICES_LEDGER_FILENAME)) - .collect() - } - - pub async fn load_non_storage_services(&self) -> Result<(), Error> { + pub async fn load_services(&self) -> Result<(), Error> { let log = &self.inner.log; let mut existing_zones = self.inner.zones.lock().await; let Some(ledger) = Ledger::::new( @@ -521,25 +502,6 @@ impl ServiceManager { Ok(()) } - pub async fn load_storage_services(&self) -> Result<(), Error> { - let log = &self.inner.log; - let mut existing_zones = self.inner.dataset_zones.lock().await; - let Some(ledger) = Ledger::::new( - log, - self.all_storage_service_ledgers().await, - ) - .await else { - return Ok(()); - }; - let services = ledger.data(); - self.initialize_services_locked( - &mut existing_zones, - &services.requests, - ) - .await?; - Ok(()) - } - /// Loads services from the services manager, and returns once all requested /// services have been started. pub async fn sled_agent_started( @@ -565,20 +527,12 @@ impl ServiceManager { .map_err(|_| "already set".to_string()) .expect("Sled Agent should only start once"); - self.load_non_storage_services().await.map_err(|e| { - error!(self.inner.log, "failed to launch non-storage services"; "error" => e.to_string()); - e - })?; - // TODO(https://github.com/oxidecomputer/omicron/issues/2973): // These will fail if the disks aren't attached. // Should we have a retry loop here? Kinda like we have with the switch // / NTP zone? - // - // NOTE: We could totally do the same thing with - // "load_non_storage_services". - self.load_storage_services().await.map_err(|e| { - error!(self.inner.log, "failed to launch storage services"; "error" => e.to_string()); + self.load_services().await.map_err(|e| { + error!(self.inner.log, "failed to launch services"; "error" => e.to_string()); e })?; @@ -929,7 +883,7 @@ impl ServiceManager { .zone .dataset .iter() - .map(|d| zone::Dataset { name: d.full() }) + .map(|d| zone::Dataset { name: d.name.full() }) .collect::>(); let devices: Vec = device_names @@ -1110,16 +1064,17 @@ impl ServiceManager { let listen_addr = &request.zone.addresses[0].to_string(); let listen_port = &CRUCIBLE_PORT.to_string(); - let dataset = request + let dataset_name = request .zone .dataset .as_ref() + .map(|d| d.name.full()) .expect("Crucible requires dataset"); let uuid = &Uuid::new_v4().to_string(); let config = PropertyGroupBuilder::new("config") .add_property("datalink", "astring", datalink) .add_property("gateway", "astring", gateway) - .add_property("dataset", "astring", &dataset.full()) + .add_property("dataset", "astring", &dataset_name) .add_property("listen_addr", "astring", listen_addr) .add_property("listen_port", "astring", listen_port) .add_property("uuid", "astring", uuid) @@ -1926,63 +1881,6 @@ impl ServiceManager { Ok(()) } - /// Ensures that a storage zone be initialized. - /// - /// These services will be instantiated by this function, and will be - /// recorded to a local file to ensure they start automatically on next - /// boot. - pub async fn ensure_storage_service( - &self, - request: ServiceZoneRequest, - ) -> Result<(), Error> { - let log = &self.inner.log; - let mut existing_zones = self.inner.dataset_zones.lock().await; - - // Read the existing set of services from the ledger. - let service_paths = self.all_storage_service_ledgers().await; - let mut ledger = - match Ledger::::new(log, service_paths.clone()) - .await - { - Some(ledger) => ledger, - None => Ledger::::new_with( - log, - service_paths.clone(), - AllZoneRequests::default(), - ), - }; - let ledger_zone_requests = ledger.data_mut(); - - if !ledger_zone_requests - .requests - .iter() - .any(|zone_request| zone_request.zone.id == request.id) - { - // If this is a new request, provision a zone filesystem on the same - // disk as the dataset. - let dataset = request - .dataset - .as_ref() - .expect("Storage services should have a dataset"); - let root = dataset - .pool() - .dataset_mountpoint(sled_hardware::disk::ZONE_DATASET); - ledger_zone_requests - .requests - .push(ZoneRequest { zone: request, root }); - } - - self.initialize_services_locked( - &mut existing_zones, - &ledger_zone_requests.requests, - ) - .await?; - - ledger.commit().await?; - - Ok(()) - } - pub fn boottime_rewrite(&self, zones: &Vec) { if self .inner diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 9a0de6f0e67..55fcf7ae933 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -9,10 +9,10 @@ use crate::config::Config; use crate::instance_manager::InstanceManager; use crate::nexus::{LazyNexusClient, NexusRequestQueue}; use crate::params::{ - DatasetEnsureBody, DiskStateRequested, InstanceHardware, - InstanceMigrationSourceParams, InstancePutStateResponse, - InstanceStateRequested, InstanceUnregisterResponse, ServiceEnsureBody, - ServiceZoneService, SledRole, TimeSync, VpcFirewallRule, Zpool, + DiskStateRequested, InstanceHardware, InstanceMigrationSourceParams, + InstancePutStateResponse, InstanceStateRequested, + InstanceUnregisterResponse, ServiceEnsureBody, SledRole, TimeSync, + VpcFirewallRule, Zpool, }; use crate::services::{self, ServiceManager}; use crate::storage_manager::{self, StorageManager}; @@ -528,6 +528,25 @@ impl SledAgent { &self, requested_services: ServiceEnsureBody, ) -> Result<(), Error> { + let datasets: Vec<_> = requested_services + .services + .iter() + .filter_map(|service| service.dataset.clone()) + .collect(); + + // TODO: + // - If these are the set of filesystems, we should also consider + // removing the ones which are not listed here. + // - It's probably worth sending a bulk request to the storage system, + // rather than requesting individual datasets. + for dataset in &datasets { + // First, ensure the dataset exists + self.inner + .storage + .upsert_filesystem(dataset.id, dataset.name.clone()) + .await?; + } + self.inner.services.ensure_all_services(requested_services).await?; Ok(()) } @@ -547,59 +566,6 @@ impl SledAgent { } } - /// Ensures that all filesystem type exists within the zpool. - pub async fn filesystems_ensure( - &self, - requested_datasets: DatasetEnsureBody, - ) -> Result<(), Error> { - // TODO: - // - If these are the set of filesystems, we should also consider - // removing the ones which are not listed here. - // - It's probably worth sending a bulk request to the storage system, - // rather than requesting individual datasets. - for dataset in &requested_datasets.datasets { - let dataset_id = dataset.id; - - // First, ensure the dataset exists - self.inner - .storage - .upsert_filesystem(dataset_id, dataset.dataset_name.clone()) - .await?; - } - - for dataset in &requested_datasets.datasets { - let dataset_id = dataset.id; - let address = dataset.address; - let gz_address = dataset.gz_address; - - // NOTE: We use the "dataset_id" as the "service_id" here. - // - // Since datasets are tightly coupled with their own services - e.g., - // from the perspective of Nexus, provisioning a dataset implies the - // sled should start a service - this is ID re-use is reasonable. - // - // If Nexus ever wants sleds to provision datasets independently of - // launching services, this ID type overlap should be reconsidered. - let service_type = dataset.dataset_name.dataset().service_type(); - let services = vec![ServiceZoneService { - id: dataset_id, - details: service_type, - }]; - - // Next, ensure a zone exists to manage storage for that dataset - let request = crate::params::ServiceZoneRequest { - id: dataset_id, - zone_type: dataset.dataset_name.dataset().zone_type(), - addresses: vec![*address.ip()], - dataset: Some(dataset.dataset_name.clone()), - gz_addresses: gz_address.into_iter().collect(), - services, - }; - self.inner.services.ensure_storage_service(request).await?; - } - Ok(()) - } - /// Idempotently ensures that a given instance is registered with this sled, /// i.e., that it can be addressed by future calls to /// [`instance_ensure_state`]. diff --git a/sled-hardware/src/lib.rs b/sled-hardware/src/lib.rs index 60cb83ca7c1..4de5e94aa58 100644 --- a/sled-hardware/src/lib.rs +++ b/sled-hardware/src/lib.rs @@ -74,7 +74,16 @@ pub enum SledMode { /// Describes properties that should uniquely identify a Gimlet. #[derive( - Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, + Clone, + Debug, + PartialOrd, + Ord, + PartialEq, + Eq, + Hash, + Serialize, + Deserialize, + JsonSchema, )] #[serde(tag = "type", rename_all = "snake_case")] pub enum Baseboard { diff --git a/tools/dendrite_openapi_version b/tools/dendrite_openapi_version index 83845c8add8..57dc977b9b0 100644 --- a/tools/dendrite_openapi_version +++ b/tools/dendrite_openapi_version @@ -1,2 +1,2 @@ -COMMIT="9e3764239515a7bb3f09c922cfb88b1be3dade77" -SHA2="885f05cb273d22a1481f693552cee25265992520a7ab029937d66b4dc50f5038" +COMMIT="3857dac89bf16851df170db2fe3797cec3c1b711" +SHA2="c687851c097dfba4f2006fafd5043e4507c1a162fce8289ecc02098157356608" diff --git a/tools/dendrite_stub_checksums b/tools/dendrite_stub_checksums index 7cdd0c8a969..e9f56d58ae6 100644 --- a/tools/dendrite_stub_checksums +++ b/tools/dendrite_stub_checksums @@ -1,3 +1,3 @@ -CIDL_SHA256_ILLUMOS="13d8ff8374ec0b5d9b681b83aef8a2d4f0aed15d0ad92dc5b04a43d850196309" -CIDL_SHA256_LINUX_DPD="ceecf310f7ca84660423259f3d9bca778e91c81a03f99cc6cf2bb40cd631598f" -CIDL_SHA256_LINUX_SWADM="af3a5d12b17eb353a513a3e1c06505b20b279ae7f26fef94152d5af06757a3d4" +CIDL_SHA256_ILLUMOS="e42742c6d253f99ef280341b6c2c3ab8658dec0595a2ebf65b6f6618e4d34b6a" +CIDL_SHA256_LINUX_DPD="14e8bfacb9367abb868901306224ab80c10c1e567484b79bf35637e9e603c5f3" +CIDL_SHA256_LINUX_SWADM="6d1b9e59e2fe0436b15f0280d26c1327a34c6fb1445ad69eca80fd112dd9a2dc" diff --git a/wicket/src/rack_setup/config_toml.rs b/wicket/src/rack_setup/config_toml.rs index fbd55e7051d..992b0f254cf 100644 --- a/wicket/src/rack_setup/config_toml.rs +++ b/wicket/src/rack_setup/config_toml.rs @@ -106,6 +106,29 @@ fn format_multiline_array(array: &mut Array) { } fn build_sleds_array(sleds: &[BootstrapSledDescription]) -> Array { + // Helper function to build the comment attached to a given sled. + fn sled_comment(sled: &BootstrapSledDescription, end: &str) -> String { + use wicketd_client::types::Baseboard; + let ip = sled + .bootstrap_ip + .map(|ip| Cow::from(format!("{ip}"))) + .unwrap_or_else(|| Cow::from("IP address UNKNOWN")); + match &sled.baseboard { + Baseboard::Gimlet { identifier, model, revision } => { + format!( + " # {identifier} (model {model} revision {revision}, {ip})\ + {end}" + ) + } + Baseboard::Unknown => { + format!(" # UNKNOWN SLED ({ip}){end}") + } + Baseboard::Pc { identifier, model } => { + format!(" # NON-GIMLET {identifier} (model {model}, {ip}){end}") + } + } + } + let mut array = Array::new(); let mut prev: Option<&BootstrapSledDescription> = None; @@ -121,13 +144,7 @@ fn build_sleds_array(sleds: &[BootstrapSledDescription]) -> Array { // We have to attach the comment for each sled on the _next_ item in the // array, so here we set our prefix to be the previous item's details. if let Some(prev) = prev { - decor.set_prefix(format!( - " # {serial} (model {model}, revision {rev}){sep}", - serial = prev.serial_number, - model = prev.model, - rev = prev.revision, - sep = ARRAY_SEP, - )); + decor.set_prefix(sled_comment(prev, ARRAY_SEP)); } else { decor.set_prefix(ARRAY_SEP); } @@ -139,12 +156,7 @@ fn build_sleds_array(sleds: &[BootstrapSledDescription]) -> Array { // Because we attach comments to previous items, we also need to add a // comment to the last element. if let Some(prev) = prev { - array.set_trailing(format!( - " # {serial} (model {model}, revision {rev})\n", - serial = prev.serial_number, - model = prev.model, - rev = prev.revision, - )); + array.set_trailing(sled_comment(prev, "\n")); array.set_trailing_comma(true); } @@ -189,7 +201,9 @@ fn populate_network_table( mod tests { use super::*; use omicron_common::api::internal::shared::RackNetworkConfig as InternalRackNetworkConfig; + use std::net::Ipv6Addr; use wicket_common::rack_setup::PutRssUserConfigInsensitive; + use wicketd_client::types::Baseboard; use wicketd_client::types::PortFec; use wicketd_client::types::PortSpeed; use wicketd_client::types::SpIdentifier; @@ -258,15 +272,21 @@ mod tests { bootstrap_sleds: vec![ BootstrapSledDescription { id: SpIdentifier { slot: 1, type_: SpType::Sled }, - model: "model1".into(), - revision: 3, - serial_number: "serial 1 2 3".into(), + baseboard: Baseboard::Gimlet { + model: "model1".into(), + revision: 3, + identifier: "serial 1 2 3".into(), + }, + bootstrap_ip: None, }, BootstrapSledDescription { id: SpIdentifier { slot: 5, type_: SpType::Sled }, - model: "model2".into(), - revision: 5, - serial_number: "serial 4 5 6".into(), + baseboard: Baseboard::Gimlet { + model: "model2".into(), + revision: 5, + identifier: "serial 4 5 6".into(), + }, + bootstrap_ip: Some(Ipv6Addr::LOCALHOST), }, ], dns_servers: vec!["1.1.1.1".into(), "2.2.2.2".into()], diff --git a/wicketd-client/src/lib.rs b/wicketd-client/src/lib.rs index 672da3fcde8..1f130a19151 100644 --- a/wicketd-client/src/lib.rs +++ b/wicketd-client/src/lib.rs @@ -37,6 +37,7 @@ progenitor::generate_api!( Ipv4Range = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, Ipv6Range = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, IpRange = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, + Baseboard = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, BootstrapSledDescription = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, RackNetworkConfig = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, CurrentRssUserConfigInsensitive = { derives = [ PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize ] }, diff --git a/wicketd/Cargo.toml b/wicketd/Cargo.toml index f6b16e64135..fbdb3d9944a 100644 --- a/wicketd/Cargo.toml +++ b/wicketd/Cargo.toml @@ -36,6 +36,7 @@ toml.workspace = true uuid.workspace = true bootstrap-agent-client.workspace = true +ddm-admin-client.workspace = true gateway-client.workspace = true installinator-artifactd.workspace = true installinator-common.workspace = true diff --git a/wicketd/src/bootstrap_addrs.rs b/wicketd/src/bootstrap_addrs.rs new file mode 100644 index 00000000000..f6936fd3326 --- /dev/null +++ b/wicketd/src/bootstrap_addrs.rs @@ -0,0 +1,175 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use ddm_admin_client::Client as DdmAdminClient; +use futures::stream::FuturesUnordered; +use sled_hardware::underlay::BootstrapInterface; +use sled_hardware::Baseboard; +use slog::warn; +use slog::Logger; +use std::collections::BTreeMap; +use std::net::Ipv6Addr; +use std::sync::Arc; +use std::sync::Mutex; +use std::time::Duration; +use tokio::task::JoinHandle; +use tokio_stream::StreamExt; + +pub(crate) struct BootstrapPeers { + // We use a standard mutex here, not a tokio mutex, even though this is + // shared with a tokio task. We only keep it locked long enough to insert a + // new entry or clone it. + sleds: Arc>>, + inner_task: JoinHandle<()>, +} + +impl Drop for BootstrapPeers { + fn drop(&mut self) { + self.inner_task.abort(); + } +} + +#[allow(dead_code)] // TODO REMOVE +impl BootstrapPeers { + pub(crate) fn new(log: &Logger) -> Self { + let log = log.new(slog::o!("component" => "BootstrapPeers")); + let sleds = Arc::default(); + let inner_task = tokio::spawn(scan_for_peers(Arc::clone(&sleds), log)); + Self { sleds, inner_task } + } + + pub(crate) fn sleds(&self) -> BTreeMap { + self.sleds.lock().unwrap().clone() + } +} + +async fn scan_for_peers( + sleds: Arc>>, + log: Logger, +) { + // How frequently do we attempt to refresh the set of peers? This does not + // count the time it takes to do each refresh, which is potentially + // significant if any prefixes reported by DDM are not running responsive + // sled-agents, since we'll wait for them to time out. + const SLEEP_BETWEEN_REFRESH: Duration = Duration::from_secs(30); + + let ddm_client = make_ddm_admin_client(&log).await; + + // We only share `sleds` with the `BootstrapPeers` that created us, and it + // only ever reads the current value: we are the only one that changes it. + // We keep the previous version we set it to so if the set of addresses + // remains unchanged (which we expect nearly all the time), we don't bother + // locking and copying the new set in. + let mut prev_sleds = None; + loop { + // Ask mg-ddm for a list of bootstrap address prefixes. + let addrs = possible_sled_agent_addrs(&ddm_client, &log).await; + + // Query the sled-agent on each prefix for its baseboard, dropping any + // addresses that fail to return. + let mut addrs_to_sleds = addrs + .map(|ip| { + let log = &log; + async move { + let client = bootstrap_agent_client::Client::new( + &format!("http://[{ip}]"), + log.clone(), + ); + let result = client.baseboard_get().await; + + (ip, result) + } + }) + .collect::>(); + + let mut all_sleds = BTreeMap::new(); + while let Some((ip, result)) = addrs_to_sleds.next().await { + match result { + Ok(baseboard) => { + // Convert from progenitor type back to `sled-hardware` + // type. + let baseboard = match baseboard.into_inner() { + bootstrap_agent_client::types::Baseboard::Gimlet { + identifier, + model, + revision, + } => Baseboard::new_gimlet(identifier, model, revision), + bootstrap_agent_client::types::Baseboard::Unknown => { + Baseboard::unknown() + } + bootstrap_agent_client::types::Baseboard::Pc { + identifier, + model, + } => Baseboard::new_pc(identifier, model), + }; + + all_sleds.insert(baseboard, ip); + } + Err(err) => { + warn!( + log, "Failed to get baseboard for {ip}"; + "err" => #%err, + ); + } + } + } + + // Did our set of peers change? If so, update both `sleds` (shared with + // our parent `BootstrapPeers`) and `prev_sleds` (our local cache). + if Some(&all_sleds) != prev_sleds.as_ref() { + *sleds.lock().unwrap() = all_sleds.clone(); + prev_sleds = Some(all_sleds); + } + + tokio::time::sleep(SLEEP_BETWEEN_REFRESH).await; + } +} + +async fn possible_sled_agent_addrs( + ddm_client: &DdmAdminClient, + log: &Logger, +) -> impl Iterator { + // TODO: Should we use `backoff` here instead of a loop/sleep? We're talking + // to a service's admin interface on localhost within our own switch zone, + // and we're only asking for its current state. Backoff should be + // unnecessary, I think? + const RETRY: Duration = Duration::from_secs(5); + + loop { + match ddm_client + .derive_bootstrap_addrs_from_prefixes(&[ + BootstrapInterface::GlobalZone, + ]) + .await + { + Ok(addrs) => return addrs, + Err(err) => { + warn!( + log, "Failed to get prefixes from ddm"; + "err" => #%err, + ); + tokio::time::sleep(RETRY).await; + } + } + } +} + +async fn make_ddm_admin_client(log: &Logger) -> DdmAdminClient { + const DDM_CONSTRUCT_RETRY: Duration = Duration::from_secs(1); + + // We don't really expect this to fail ever, so just keep retrying + // indefinitely if it does. + loop { + match DdmAdminClient::localhost(log) { + Ok(client) => return client, + Err(err) => { + warn!( + log, "Failed to construct DdmAdminClient"; + "err" => #%err, + ); + tokio::time::sleep(DDM_CONSTRUCT_RETRY).await; + } + } + } +} diff --git a/wicketd/src/context.rs b/wicketd/src/context.rs index 1142ee91444..a2075d051bf 100644 --- a/wicketd/src/context.rs +++ b/wicketd/src/context.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use std::sync::Mutex; +use crate::bootstrap_addrs::BootstrapPeers; use crate::rss_config::CurrentRssConfig; use crate::update_tracker::UpdateTracker; use crate::MgsHandle; @@ -16,6 +17,7 @@ use sled_hardware::Baseboard; pub struct ServerContext { pub mgs_handle: MgsHandle, pub mgs_client: gateway_client::Client, + pub(crate) bootstrap_peers: BootstrapPeers, pub(crate) update_tracker: Arc, pub(crate) baseboard: Option, pub(crate) rss_config: Mutex, diff --git a/wicketd/src/http_entrypoints.rs b/wicketd/src/http_entrypoints.rs index 6d77ed15c89..f2c00197bf9 100644 --- a/wicketd/src/http_entrypoints.rs +++ b/wicketd/src/http_entrypoints.rs @@ -32,6 +32,7 @@ use serde::Serialize; use sled_hardware::Baseboard; use std::collections::BTreeMap; use std::collections::BTreeSet; +use std::net::Ipv6Addr; use std::time::Duration; use uuid::Uuid; use wicket_common::rack_setup::PutRssUserConfigInsensitive; @@ -46,6 +47,7 @@ pub fn api() -> WicketdApiDescription { fn register_endpoints( api: &mut WicketdApiDescription, ) -> Result<(), String> { + api.register(get_bootstrap_sleds)?; api.register(get_rss_config)?; api.register(put_rss_config)?; api.register(put_rss_config_recovery_user_password_hash)?; @@ -71,6 +73,57 @@ pub fn api() -> WicketdApiDescription { api } +#[derive( + Clone, + Debug, + Serialize, + Deserialize, + JsonSchema, + PartialEq, + Eq, + PartialOrd, + Ord, +)] +pub struct BootstrapSledIp { + pub baseboard: Baseboard, + pub ip: Ipv6Addr, +} + +#[derive( + Clone, + Debug, + Serialize, + Deserialize, + JsonSchema, + PartialEq, + Eq, + PartialOrd, + Ord, +)] +pub struct BootstrapSledIps { + pub sleds: Vec, +} + +/// Get wicketd's current view of all sleds visible on the bootstrap network. +#[endpoint { + method = GET, + path = "/bootstrap-sleds" +}] +async fn get_bootstrap_sleds( + rqctx: RequestContext, +) -> Result, HttpError> { + let ctx = rqctx.context(); + + let sleds = ctx + .bootstrap_peers + .sleds() + .into_iter() + .map(|(baseboard, ip)| BootstrapSledIp { baseboard, ip }) + .collect(); + + Ok(HttpResponseOk(BootstrapSledIps { sleds })) +} + #[derive( Clone, Debug, @@ -84,9 +137,10 @@ pub fn api() -> WicketdApiDescription { )] pub struct BootstrapSledDescription { pub id: SpIdentifier, - pub serial_number: String, - pub model: String, - pub revision: u32, + pub baseboard: Baseboard, + /// The sled's bootstrap address, if the host is on and we've discovered it + /// on the bootstrap network. + pub bootstrap_ip: Option, } // This is the subset of `RackInitializeRequest` that the user fills in in clear @@ -152,7 +206,10 @@ async fn get_rss_config( let inventory = inventory_or_unavail(&ctx.mgs_handle).await?; let mut config = ctx.rss_config.lock().unwrap(); - config.populate_available_bootstrap_sleds_from_inventory(&inventory); + config.populate_available_bootstrap_sleds_from_inventory( + &inventory, + &ctx.bootstrap_peers, + ); Ok(HttpResponseOk((&*config).into())) } @@ -176,7 +233,10 @@ async fn put_rss_config( let inventory = inventory_or_unavail(&ctx.mgs_handle).await?; let mut config = ctx.rss_config.lock().unwrap(); - config.populate_available_bootstrap_sleds_from_inventory(&inventory); + config.populate_available_bootstrap_sleds_from_inventory( + &inventory, + &ctx.bootstrap_peers, + ); config .update(body.into_inner(), ctx.baseboard.as_ref()) .map_err(|err| HttpError::for_bad_request(None, err))?; diff --git a/wicketd/src/lib.rs b/wicketd/src/lib.rs index 61c729c8393..52c750f3412 100644 --- a/wicketd/src/lib.rs +++ b/wicketd/src/lib.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. mod artifacts; +mod bootstrap_addrs; mod config; mod context; mod http_entrypoints; @@ -14,6 +15,7 @@ mod update_tracker; use anyhow::{anyhow, Result}; use artifacts::{WicketdArtifactServer, WicketdArtifactStore}; +use bootstrap_addrs::BootstrapPeers; pub use config::Config; pub(crate) use context::ServerContext; pub use installinator_progress::{IprUpdateTracker, RunningUpdateState}; @@ -97,6 +99,8 @@ impl Server { ipr_update_tracker.clone(), )); + let bootstrap_peers = BootstrapPeers::new(&log); + let wicketd_server = { let log = log.new(o!("component" => "dropshot (wicketd)")); let mgs_client = make_mgs_client(log.clone(), args.mgs_address); @@ -106,6 +110,7 @@ impl Server { ServerContext { mgs_handle, mgs_client, + bootstrap_peers, update_tracker: update_tracker.clone(), baseboard: args.baseboard, rss_config: Default::default(), diff --git a/wicketd/src/rss_config.rs b/wicketd/src/rss_config.rs index dfe766f57f5..4c0b057f133 100644 --- a/wicketd/src/rss_config.rs +++ b/wicketd/src/rss_config.rs @@ -4,6 +4,7 @@ //! Support for user-provided RSS configuration options. +use crate::bootstrap_addrs::BootstrapPeers; use crate::http_entrypoints::BootstrapSledDescription; use crate::http_entrypoints::CertificateUploadResponse; use crate::http_entrypoints::CurrentRssUserConfig; @@ -51,7 +52,10 @@ impl CurrentRssConfig { pub(crate) fn populate_available_bootstrap_sleds_from_inventory( &mut self, inventory: &RackV1Inventory, + bootstrap_peers: &BootstrapPeers, ) { + let bootstrap_sleds = bootstrap_peers.sleds(); + self.inventory = inventory .sps .iter() @@ -60,11 +64,16 @@ impl CurrentRssConfig { return None; } let state = sp.state.as_ref()?; + let baseboard = Baseboard::new_gimlet( + state.serial_number.clone(), + state.model.clone(), + state.revision.into(), + ); + let bootstrap_ip = bootstrap_sleds.get(&baseboard).copied(); Some(BootstrapSledDescription { id: sp.id, - serial_number: state.serial_number.clone(), - model: state.model.clone(), - revision: state.revision, + baseboard, + bootstrap_ip, }) }) .collect(); @@ -148,17 +157,12 @@ impl CurrentRssConfig { // First, confirm we have ourself in the inventory _and_ the user didn't // remove us from the list. - if let Some(Baseboard::Gimlet { identifier, model, revision }) = - our_baseboard - { + if let Some(our_baseboard @ Baseboard::Gimlet { .. }) = our_baseboard { let our_slot = self .inventory .iter() .find_map(|sled| { - if &sled.serial_number == identifier - && &sled.model == model - && i64::from(sled.revision) == *revision - { + if sled.baseboard == *our_baseboard { Some(sled.id.slot) } else { None @@ -167,13 +171,13 @@ impl CurrentRssConfig { .ok_or_else(|| { format!( "Inventory is missing the scrimlet where wicketd is \ - running ({identifier}, model {model} rev {revision})", + running ({our_baseboard:?})", ) })?; if !value.bootstrap_sleds.contains(&our_slot) { return Err(format!( "Cannot remove the scrimlet where wicketd is running \ - (sled {our_slot}: {identifier}, model {model} rev {revision}) \ + (sled {our_slot}: {our_baseboard:?}) \ from bootstrap_sleds" )); }