Skip to content

[nexus] the support bundle task should execute diag commands concurrently #7461

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 74 additions & 24 deletions nexus/src/app/background/tasks/support_bundle_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use camino_tempfile::tempfile_in;
use camino_tempfile::Utf8TempDir;
use futures::future::BoxFuture;
use futures::FutureExt;
use futures::StreamExt;
use nexus_db_model::SupportBundle;
use nexus_db_model::SupportBundleState;
use nexus_db_queries::authz;
Expand All @@ -34,6 +35,7 @@ use omicron_uuid_kinds::SupportBundleUuid;
use omicron_uuid_kinds::ZpoolUuid;
use serde_json::json;
use sha2::{Digest, Sha256};
use std::future::Future;
use std::io::Write;
use std::sync::Arc;
use tokio::io::AsyncReadExt;
Expand Down Expand Up @@ -599,24 +601,65 @@ impl BundleCollection<'_> {
continue;
};

write_command_result_or_error(
&sled_path,
"dladm",
sled_client.support_dladm_info().await,
)
.await?;
write_command_result_or_error(
&sled_path,
"ipadm",
sled_client.support_ipadm_info().await,
)
.await?;
write_command_result_or_error(
&sled_path,
"zoneadm",
sled_client.support_zoneadm_info().await,
)
.await?;
// NB: As new sled-diagnostic commands are added they should
// be added to this array so that their output can be saved
// within the support bundle.
let mut diag_cmds = futures::stream::iter([
save_diag_cmd_output_or_error(
&sled_path,
"zoneadm",
sled_client.support_zoneadm_info(),
)
.boxed(),
save_diag_cmd_output_or_error(
&sled_path,
"dladm",
sled_client.support_dladm_info(),
)
.boxed(),
save_diag_cmd_output_or_error(
&sled_path,
"ipadm",
sled_client.support_ipadm_info(),
)
.boxed(),
save_diag_cmd_output_or_error(
&sled_path,
"pargs",
sled_client.support_pargs_info(),
)
.boxed(),
save_diag_cmd_output_or_error(
&sled_path,
"pfiles",
sled_client.support_pfiles_info(),
)
.boxed(),
save_diag_cmd_output_or_error(
&sled_path,
"pstack",
sled_client.support_pstack_info(),
)
.boxed(),
])
// Currently we execute up to 10 commands concurrently which
// might be doing their own concurrent work, for example
// collectiong `pstack` output of every Oxide process that is
// found on a sled.
.buffer_unordered(10);

while let Some(result) = diag_cmds.next().await {
// Log that we failed to write the diag command output to a
// file but don't return early as we wish to get as much
// information as we can.
if let Err(e) = result {
error!(
&self.log,
"failed to write diagnostic command output to \
file: {e}"
);
}
}
}
}

Expand Down Expand Up @@ -728,14 +771,21 @@ async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result<ArtifactHash> {
Ok(ArtifactHash(digest.as_slice().try_into()?))
}

async fn write_command_result_or_error<D: std::fmt::Debug>(
/// Run a `sled-dianostics` future and save its output to a corresponding file.
async fn save_diag_cmd_output_or_error<F, D: std::fmt::Debug>(
path: &Utf8Path,
command: &str,
result: Result<
sled_agent_client::ResponseValue<D>,
sled_agent_client::Error<sled_agent_client::types::Error>,
>,
) -> anyhow::Result<()> {
future: F,
) -> anyhow::Result<()>
where
F: Future<
Output = Result<
sled_agent_client::ResponseValue<D>,
sled_agent_client::Error<sled_agent_client::types::Error>,
>,
> + Send,
{
let result = future.await;
match result {
Ok(result) => {
let output = result.into_inner();
Expand Down
Loading