Skip to content

[sled-agent-config-reconciler] Inventory types for reconciler status #8180

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions nexus-sled-agent-shared/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
workspace = true

[dependencies]
chrono.workspace = true
daft.workspace = true
id-map.workspace = true
illumos-utils.workspace = true
Expand Down
55 changes: 54 additions & 1 deletion nexus-sled-agent-shared/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

//! Inventory types shared between Nexus and sled-agent.

use std::collections::BTreeMap;
use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6};
use std::time::Duration;

use chrono::{DateTime, Utc};
use daft::Diffable;
use id_map::IdMap;
use id_map::IdMappable;
Expand All @@ -21,8 +24,8 @@ use omicron_common::{
},
zpool_name::ZpoolName,
};
use omicron_uuid_kinds::MupdateOverrideUuid;
use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid};
use omicron_uuid_kinds::{MupdateOverrideUuid, PhysicalDiskUuid};
use omicron_uuid_kinds::{SledUuid, ZpoolUuid};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -115,6 +118,56 @@ pub struct Inventory {
pub omicron_physical_disks_generation: Generation,
}

/// Describes the last attempt made by the sled-agent-config-reconciler to
/// reconcile the current sled config against the actual state of the sled.
///
/// Each map below records one [`ConfigReconcilerInventoryResult`] per item,
/// keyed by that item's ID.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ConfigReconcilerInventory {
/// The sled config used in the last reconciliation attempt.
pub last_reconciled_config: OmicronSledConfig,
/// Per-disk result, keyed by physical disk ID.
pub external_disks:
BTreeMap<PhysicalDiskUuid, ConfigReconcilerInventoryResult>,
/// Per-dataset result, keyed by dataset ID.
pub datasets: BTreeMap<DatasetUuid, ConfigReconcilerInventoryResult>,
/// Per-zone result, keyed by Omicron zone ID.
pub zones: BTreeMap<OmicronZoneUuid, ConfigReconcilerInventoryResult>,
}

/// Outcome recorded for a single item (disk, dataset, or zone) in a
/// [`ConfigReconcilerInventory`].
///
/// Serialized as an internally tagged enum: the variant name is stored in
/// snake_case under the `result` key.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(tag = "result", rename_all = "snake_case")]
pub enum ConfigReconcilerInventoryResult {
/// The item was handled without error.
Ok,
/// The item failed; `message` carries the error rendered as a string.
Err { message: String },
}

/// Converts a plain `Result<(), String>` into its inventory representation:
/// success maps to `Ok`, and a failure's string becomes the `Err` message.
impl From<Result<(), String>> for ConfigReconcilerInventoryResult {
    fn from(result: Result<(), String>) -> Self {
        if let Err(message) = result {
            Self::Err { message }
        } else {
            Self::Ok
        }
    }
}

/// Status of the sled-agent-config-reconciler task.
///
/// Serialized as an internally tagged enum: the variant name is stored in
/// snake_case under the `status` key alongside the variant's own fields.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum ConfigReconcilerInventoryStatus {
/// The reconciler task has not yet run for the first time since sled-agent
/// started.
NotYetRun,
/// The reconciler task is actively running.
Running {
/// NOTE(review): presumably the sled config the in-progress attempt is
/// reconciling against; confirm against the reconciler task.
config: OmicronSledConfig,
/// UTC timestamp; going by the name, when the current attempt started.
started_at: DateTime<Utc>,
/// Going by the name, how long the current attempt has been running
/// (TODO confirm the reference point with the producer of this value).
running_for: Duration,
},
/// The reconciler task is currently idle, but previously did complete a
/// reconciliation attempt.
///
/// This variant does not include the `OmicronSledConfig` used in the last
/// attempt, because that's always available via
/// [`ConfigReconcilerInventory::last_reconciled_config`].
///
/// `completed_at` is a UTC timestamp and `ran_for` a duration; going by
/// their names they describe when the last attempt finished and how long it
/// took (TODO confirm with the producer of these values).
Idle { completed_at: DateTime<Utc>, ran_for: Duration },
}

/// Describes the role of the sled within the rack.
///
/// Note that this may change if the sled is physically moved
Expand Down
71 changes: 68 additions & 3 deletions sled-agent/config-reconciler/src/dataset_serialization_task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
//! using oneshot channels to send responses".

use crate::CurrentlyManagedZpoolsReceiver;
use crate::InventoryError;
use camino::Utf8PathBuf;
use debug_ignore::DebugIgnore;
use futures::StreamExt;
Expand All @@ -26,6 +27,7 @@ use illumos_utils::zfs::WhichDatasets;
use illumos_utils::zfs::Zfs;
use illumos_utils::zpool::PathInPool;
use illumos_utils::zpool::ZpoolOrRamdisk;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
use nexus_sled_agent_shared::inventory::InventoryDataset;
use omicron_common::disk::DatasetConfig;
use omicron_common::disk::DatasetKind;
Expand All @@ -34,6 +36,7 @@ use omicron_common::disk::SharedDatasetConfig;
use omicron_common::zpool_name::ZpoolName;
use omicron_uuid_kinds::DatasetUuid;
use sled_storage::config::MountConfig;
use sled_storage::dataset::CRYPT_DATASET;
use sled_storage::dataset::U2_DEBUG_DATASET;
use sled_storage::dataset::ZONE_DATASET;
use sled_storage::manager::NestedDatasetConfig;
Expand Down Expand Up @@ -169,6 +172,25 @@ impl DatasetEnsureResult {
})
}

/// Summarize the per-dataset ensure outcomes in inventory form.
///
/// Every dataset tracked by this result contributes one entry keyed by its
/// configured ID: `Ok` if it was ensured, or `Err` carrying the rendered
/// error chain if ensuring it failed.
pub(crate) fn to_inventory(
    &self,
) -> BTreeMap<DatasetUuid, ConfigReconcilerInventoryResult> {
    let mut results = BTreeMap::new();
    for entry in self.0.iter() {
        let outcome = match &entry.state {
            DatasetState::Ensured => ConfigReconcilerInventoryResult::Ok,
            DatasetState::FailedToEnsure(err) => {
                // Flatten the full source chain into one string so the
                // underlying cause survives serialization.
                let message = InlineErrorChain::new(err).to_string();
                ConfigReconcilerInventoryResult::Err { message }
            }
        };
        results.insert(entry.config.id, outcome);
    }
    results
}

pub(crate) fn all_mounted_debug_datasets<'a>(
&'a self,
mount_config: &'a MountConfig,
Expand Down Expand Up @@ -285,9 +307,11 @@ impl DatasetTaskHandle {

pub async fn inventory(
&self,
_zpools: BTreeSet<ZpoolName>,
) -> Result<Vec<InventoryDataset>, DatasetTaskError> {
unimplemented!()
zpools: BTreeSet<ZpoolName>,
) -> Result<Result<Vec<InventoryDataset>, InventoryError>, DatasetTaskError>
{
self.try_send_request(|tx| DatasetTaskRequest::Inventory { zpools, tx })
.await
}

pub async fn datasets_ensure(
Expand Down Expand Up @@ -398,6 +422,9 @@ impl DatasetTask {
) {
// In all cases, we don't care if the receiver is gone.
match request {
DatasetTaskRequest::Inventory { zpools, tx } => {
_ = tx.0.send(self.inventory(zpools, zfs).await);
}
DatasetTaskRequest::DatasetsEnsure { datasets, tx } => {
self.datasets_ensure(datasets, zfs).await;
_ = tx.0.send(self.datasets.clone());
Expand All @@ -419,6 +446,38 @@ impl DatasetTask {
}
}

async fn inventory<T: ZfsImpl>(
&mut self,
zpools: BTreeSet<ZpoolName>,
zfs: &T,
) -> Result<Vec<InventoryDataset>, InventoryError> {
let datasets_of_interest = zpools
.iter()
.flat_map(|zpool| {
[
// We care about the zpool itself, and all direct children.
zpool.to_string(),
// Likewise, we care about the encrypted dataset, and all
// direct children.
format!("{zpool}/{CRYPT_DATASET}"),
// The zone dataset gives us additional context on "what
// zones have datasets provisioned".
format!("{zpool}/{ZONE_DATASET}"),
]
})
.collect::<Vec<_>>();

let props = zfs
.get_dataset_properties(
&datasets_of_interest,
WhichDatasets::SelfAndChildren,
)
.await
.map_err(InventoryError::ListDatasetProperties)?;

Ok(props.into_iter().map(From::from).collect())
}

async fn datasets_ensure<T: ZfsImpl>(
&mut self,
config: IdMap<DatasetConfig>,
Expand Down Expand Up @@ -947,6 +1006,12 @@ impl DatasetTask {

#[derive(Debug)]
enum DatasetTaskRequest {
Inventory {
zpools: BTreeSet<ZpoolName>,
tx: DebugIgnore<
oneshot::Sender<Result<Vec<InventoryDataset>, InventoryError>>,
>,
},
DatasetsEnsure {
datasets: IdMap<DatasetConfig>,
tx: DebugIgnore<oneshot::Sender<DatasetEnsureResult>>,
Expand Down
72 changes: 70 additions & 2 deletions sled-agent/config-reconciler/src/handle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
use camino::Utf8PathBuf;
use illumos_utils::zpool::PathInPool;
use key_manager::StorageKeyRequester;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
use nexus_sled_agent_shared::inventory::InventoryDataset;
use nexus_sled_agent_shared::inventory::InventoryDisk;
use nexus_sled_agent_shared::inventory::InventoryZpool;
Expand Down Expand Up @@ -48,6 +50,7 @@ use crate::dataset_serialization_task::DatasetTaskHandle;
use crate::dataset_serialization_task::NestedDatasetMountError;
use crate::dump_setup_task;
use crate::internal_disks::InternalDisksReceiver;
use crate::ledger::CurrentSledConfig;
use crate::ledger::LedgerTaskHandle;
use crate::raw_disks;
use crate::raw_disks::RawDisksReceiver;
Expand All @@ -57,6 +60,16 @@ use crate::reconciler_task::CurrentlyManagedZpools;
use crate::reconciler_task::CurrentlyManagedZpoolsReceiver;
use crate::reconciler_task::ReconcilerResult;

/// Errors that can occur while collecting the config reconciler's portion of
/// sled-agent inventory.
#[derive(Debug, thiserror::Error)]
pub enum InventoryError {
/// The ledger task has not been spawned yet, or it is still waiting for
/// internal disks, so it is unknown whether a ledgered sled config exists.
#[error("ledger contents not yet available")]
LedgerContentsNotAvailable,
/// The request to the dataset task failed; the underlying
/// `DatasetTaskError` is kept as the error source.
#[error("could not contact dataset task")]
DatasetTaskError(#[from] DatasetTaskError),
/// Querying ZFS for dataset properties failed; the underlying error is
/// kept as the error source.
#[error("could not list dataset properties")]
ListDatasetProperties(#[source] anyhow::Error),
}

#[derive(Debug, Clone, Copy)]
pub enum TimeSyncConfig {
// Waits for NTP to confirm that time has been synchronized.
Expand Down Expand Up @@ -331,8 +344,55 @@ impl ConfigReconcilerHandle {
}

/// Collect inventory fields relevant to config reconciliation.
pub fn inventory(&self) -> ReconcilerInventory {
unimplemented!()
/// Collect inventory fields relevant to config reconciliation.
///
/// # Errors
///
/// Returns [`InventoryError::LedgerContentsNotAvailable`] if the ledger task
/// has not been spawned or is still waiting for internal disks, and
/// propagates any failure to reach the dataset task or to list dataset
/// properties.
pub async fn inventory(
    &self,
    log: &Logger,
) -> Result<ReconcilerInventory, InventoryError> {
    // If we haven't yet spawned the ledger task, or we have but it's still
    // waiting on disks, we don't know whether we have a ledgered sled
    // config. It's not reasonable to report `None` in this case (since
    // `None` means "we don't have a config"), so bail out.
    //
    // This shouldn't happen in practice: sled-agent should both wait for
    // the boot disk and spawn the reconciler task before starting the
    // dropshot server that allows Nexus to collect inventory.
    let current_config = match self.ledger_task.get() {
        Some(ledger_task) => ledger_task.current_config(),
        None => return Err(InventoryError::LedgerContentsNotAvailable),
    };
    let ledgered_sled_config = match current_config {
        CurrentSledConfig::WaitingForInternalDisks => {
            return Err(InventoryError::LedgerContentsNotAvailable);
        }
        CurrentSledConfig::WaitingForInitialConfig => None,
        CurrentSledConfig::Ledgered(config) => Some(config),
    };

    let zpools = self.currently_managed_zpools_rx.to_inventory(log).await;

    // The outer error means the dataset task could not be reached; the
    // inner one is a failure reported by the inventory collection itself.
    let dataset_reply = self
        .dataset_task
        .inventory(zpools.iter().map(|(name, _)| *name).collect())
        .await?;
    let datasets = dataset_reply?;

    let (reconciler_status, last_reconciliation) =
        self.reconciler_result_rx.borrow().to_inventory();

    let disks = self.raw_disks_tx.to_inventory();
    let zpools = zpools
        .into_iter()
        .map(|(name, total_size)| InventoryZpool {
            id: name.id(),
            total_size,
        })
        .collect();

    Ok(ReconcilerInventory {
        disks,
        zpools,
        datasets,
        ledgered_sled_config,
        reconciler_status,
        last_reconciliation,
    })
}
}

Expand All @@ -346,12 +406,20 @@ struct ReconcilerTaskDependencies {
reconciler_task_log: Logger,
}

/// Fields of sled-agent inventory reported by the config reconciler subsystem.
///
/// Note that much like inventory in general, these fields are not collected
/// atomically; if there are active changes being made while this struct is
/// being assembled, different fields may have been populated from different
/// states of the world.
#[derive(Debug)]
pub struct ReconcilerInventory {
/// Inventory records for the sled's disks.
pub disks: Vec<InventoryDisk>,
/// One record (ID and total size) per currently managed zpool.
pub zpools: Vec<InventoryZpool>,
/// Inventory records for the datasets found on those zpools.
pub datasets: Vec<InventoryDataset>,
/// The ledgered sled config, or `None` if no config has been ledgered yet.
pub ledgered_sled_config: Option<OmicronSledConfig>,
/// Current status of the reconciler task.
pub reconciler_status: ConfigReconcilerInventoryStatus,
/// NOTE(review): presumably `None` until a reconciliation attempt has
/// completed; confirm against the reconciler task.
pub last_reconciliation: Option<ConfigReconcilerInventory>,
}

#[derive(Debug, Clone)]
Expand Down
10 changes: 9 additions & 1 deletion sled-agent/config-reconciler/src/ledger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ pub(crate) enum CurrentSledConfig {
/// Handle for interacting with the ledger task.
#[derive(Debug)]
pub(crate) struct LedgerTaskHandle {
// Channel on which requests are sent to the ledger task.
request_tx: mpsc::Sender<LedgerTaskRequest>,
// Watch channel holding the current sled config state; read by
// `current_config()`.
current_config_rx: watch::Receiver<CurrentSledConfig>,
}

impl LedgerTaskHandle {
Expand Down Expand Up @@ -160,7 +161,14 @@ impl LedgerTaskHandle {
.run(),
);

(Self { request_tx }, current_config_rx)
(
Self { request_tx, current_config_rx: current_config_rx.clone() },
current_config_rx,
)
}

/// Return an owned snapshot of the current sled config state held in the
/// watch channel.
pub(crate) fn current_config(&self) -> CurrentSledConfig {
    let current = self.current_config_rx.borrow();
    current.clone()
}

pub async fn set_new_config(
Expand Down
1 change: 1 addition & 0 deletions sled-agent/config-reconciler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ pub use dataset_serialization_task::NestedDatasetMountError;
pub use handle::AvailableDatasetsReceiver;
pub use handle::ConfigReconcilerHandle;
pub use handle::ConfigReconcilerSpawnToken;
pub use handle::InventoryError;
pub use handle::ReconcilerInventory;
pub use handle::TimeSyncConfig;
pub use internal_disks::InternalDisks;
Expand Down
Loading
Loading