Skip to content

Commit f6f85a7

Browse files
authored
[nexus] the support bundle task should execute diag commands concurrently (#7461)
This PR adds some of the `sled-diagnostics` crate's commands that were not yet being collected. Additionally, we now have an array of commands that will be run concurrently (currently limited to 10 at a time), which we can add to as more support commands become available.
1 parent ff715c8 commit f6f85a7

File tree

1 file changed

+74
-24
lines changed

1 file changed

+74
-24
lines changed

nexus/src/app/background/tasks/support_bundle_collector.rs

Lines changed: 74 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use camino_tempfile::Utf8TempDir;
1111
use camino_tempfile::tempdir_in;
1212
use camino_tempfile::tempfile_in;
1313
use futures::FutureExt;
14+
use futures::StreamExt;
1415
use futures::future::BoxFuture;
1516
use nexus_db_model::SupportBundle;
1617
use nexus_db_model::SupportBundleState;
@@ -34,6 +35,7 @@ use omicron_uuid_kinds::SupportBundleUuid;
3435
use omicron_uuid_kinds::ZpoolUuid;
3536
use serde_json::json;
3637
use sha2::{Digest, Sha256};
38+
use std::future::Future;
3739
use std::io::Write;
3840
use std::sync::Arc;
3941
use tokio::io::AsyncReadExt;
@@ -599,24 +601,65 @@ impl BundleCollection<'_> {
599601
continue;
600602
};
601603

602-
write_command_result_or_error(
603-
&sled_path,
604-
"dladm",
605-
sled_client.support_dladm_info().await,
606-
)
607-
.await?;
608-
write_command_result_or_error(
609-
&sled_path,
610-
"ipadm",
611-
sled_client.support_ipadm_info().await,
612-
)
613-
.await?;
614-
write_command_result_or_error(
615-
&sled_path,
616-
"zoneadm",
617-
sled_client.support_zoneadm_info().await,
618-
)
619-
.await?;
604+
// NB: As new sled-diagnostic commands are added they should
605+
// be added to this array so that their output can be saved
606+
// within the support bundle.
607+
let mut diag_cmds = futures::stream::iter([
608+
save_diag_cmd_output_or_error(
609+
&sled_path,
610+
"zoneadm",
611+
sled_client.support_zoneadm_info(),
612+
)
613+
.boxed(),
614+
save_diag_cmd_output_or_error(
615+
&sled_path,
616+
"dladm",
617+
sled_client.support_dladm_info(),
618+
)
619+
.boxed(),
620+
save_diag_cmd_output_or_error(
621+
&sled_path,
622+
"ipadm",
623+
sled_client.support_ipadm_info(),
624+
)
625+
.boxed(),
626+
save_diag_cmd_output_or_error(
627+
&sled_path,
628+
"pargs",
629+
sled_client.support_pargs_info(),
630+
)
631+
.boxed(),
632+
save_diag_cmd_output_or_error(
633+
&sled_path,
634+
"pfiles",
635+
sled_client.support_pfiles_info(),
636+
)
637+
.boxed(),
638+
save_diag_cmd_output_or_error(
639+
&sled_path,
640+
"pstack",
641+
sled_client.support_pstack_info(),
642+
)
643+
.boxed(),
644+
])
645+
// Currently we execute up to 10 commands concurrently which
646+
// might be doing their own concurrent work, for example
647+
// collecting `pstack` output of every Oxide process that is
648+
// found on a sled.
649+
.buffer_unordered(10);
650+
651+
while let Some(result) = diag_cmds.next().await {
652+
// Log that we failed to write the diag command output to a
653+
// file but don't return early as we wish to get as much
654+
// information as we can.
655+
if let Err(e) = result {
656+
error!(
657+
&self.log,
658+
"failed to write diagnostic command output to \
659+
file: {e}"
660+
);
661+
}
662+
}
620663
}
621664
}
622665

@@ -728,14 +771,21 @@ async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result<ArtifactHash> {
728771
Ok(ArtifactHash(digest.as_slice().try_into()?))
729772
}
730773

731-
async fn write_command_result_or_error<D: std::fmt::Debug>(
774+
/// Run a `sled-diagnostics` future and save its output to a corresponding file.
775+
async fn save_diag_cmd_output_or_error<F, D: std::fmt::Debug>(
732776
path: &Utf8Path,
733777
command: &str,
734-
result: Result<
735-
sled_agent_client::ResponseValue<D>,
736-
sled_agent_client::Error<sled_agent_client::types::Error>,
737-
>,
738-
) -> anyhow::Result<()> {
778+
future: F,
779+
) -> anyhow::Result<()>
780+
where
781+
F: Future<
782+
Output = Result<
783+
sled_agent_client::ResponseValue<D>,
784+
sled_agent_client::Error<sled_agent_client::types::Error>,
785+
>,
786+
> + Send,
787+
{
788+
let result = future.await;
739789
match result {
740790
Ok(result) => {
741791
let output = result.into_inner();

0 commit comments

Comments
 (0)