Skip to content

Commit b606a57

Browse files
author
lif
committed
wip: give sled-agent the generation of the mark-as-failure
1 parent 0c86105 commit b606a57

File tree

5 files changed

+42
-13
lines changed

5 files changed

+42
-13
lines changed

end-to-end-tests/src/instance_launch.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ use anyhow::{ensure, Context as _, Result};
55
use async_trait::async_trait;
66
use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
77
use oxide_client::types::{
8-
ByteCount, DiskCreate, DiskSource, ExternalIpCreate, InstanceCpuCount,
9-
InstanceCreate, InstanceDiskAttachment, InstanceNetworkInterfaceAttachment,
10-
InstanceState, SshKeyCreate,
8+
ByteCount, DiskCreate, DiskSource, ExternalIp, ExternalIpCreate,
9+
InstanceCpuCount, InstanceCreate, InstanceDiskAttachment,
10+
InstanceNetworkInterfaceAttachment, InstanceState, SshKeyCreate,
1111
};
1212
use oxide_client::{ClientDisksExt, ClientInstancesExt, ClientSessionExt};
1313
use russh::{ChannelMsg, Disconnect};
@@ -223,7 +223,8 @@ async fn instance_launch() -> Result<()> {
223223
.most_recent(1024 * 1024)
224224
.max_bytes(1024 * 1024)
225225
.send()
226-
.await?
226+
.await
227+
.map_err(|e| Error::NotYet)?
227228
.data,
228229
)
229230
.into_owned();

nexus/src/app/instance.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use nexus_db_queries::db::identity::Resource;
2828
use nexus_db_queries::db::lookup;
2929
use nexus_db_queries::db::lookup::LookupPath;
3030
use nexus_types::external_api::views;
31+
use nexus_types::internal_api::views::HandleInstancePutResultResult;
3132
use omicron_common::address::PROPOLIS_PORT;
3233
use omicron_common::api::external::http_pagination::PaginatedBy;
3334
use omicron_common::api::external::ByteCount;
@@ -1040,7 +1041,7 @@ impl super::Nexus {
10401041
opctx: &OpContext,
10411042
instance_id: &Uuid,
10421043
result: Result<nexus::SledInstanceState, Error>,
1043-
) -> Result<views::HandleInstancePutResultResult, Error> {
1044+
) -> Result<HandleInstancePutResultResult, Error> {
10441045
let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore)
10451046
.instance_id(*instance_id)
10461047
.lookup_for(authz::Action::Modify)
@@ -1053,10 +1054,16 @@ impl super::Nexus {
10531054
.await?;
10541055
if !inst_updated && !vmm_updated {
10551056
// the generation number was bumped up by the timeout task.
1056-
// TODO check actual state / put it in the TimedOut variant?
1057-
Ok(views::HandleInstancePutResultResult::TimedOut)
1057+
// TODO check actual state value (need more specificity than these two bools?)
1058+
let state = self
1059+
.db_datastore
1060+
.instance_fetch_with_vmm(opctx, &authz_instance)
1061+
.await?;
1062+
Ok(HandleInstancePutResultResult::TimedOut {
1063+
generation: *state.instance().runtime().gen,
1064+
})
10581065
} else {
1059-
Ok(views::HandleInstancePutResultResult::Ok)
1066+
Ok(HandleInstancePutResultResult::Ok)
10601067
}
10611068
}
10621069
Err(error) => {

nexus/types/src/internal_api/views.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use chrono::DateTime;
66
use chrono::Utc;
77
use futures::future::ready;
88
use futures::stream::StreamExt;
9+
use omicron_common::api::external::Generation;
910
use omicron_common::api::external::ObjectStream;
1011
use schemars::JsonSchema;
1112
use serde::Serialize;
@@ -304,5 +305,5 @@ pub struct LastResultCompleted {
304305
#[serde(rename_all = "snake_case", tag = "last_result", content = "details")]
305306
pub enum HandleInstancePutResultResult {
306307
Ok,
307-
TimedOut,
308+
TimedOut { generation: Generation },
308309
}

openapi/nexus-internal.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3826,6 +3826,17 @@
38263826
{
38273827
"type": "object",
38283828
"properties": {
3829+
"details": {
3830+
"type": "object",
3831+
"properties": {
3832+
"generation": {
3833+
"$ref": "#/components/schemas/Generation"
3834+
}
3835+
},
3836+
"required": [
3837+
"generation"
3838+
]
3839+
},
38293840
"last_result": {
38303841
"type": "string",
38313842
"enum": [
@@ -3834,6 +3845,7 @@
38343845
}
38353846
},
38363847
"required": [
3848+
"details",
38373849
"last_result"
38383850
]
38393851
}

sled-agent/src/instance_manager.rs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -387,8 +387,12 @@ impl InstanceManager {
387387
.map(nexus_client::ResponseValue::into_inner)
388388
{
389389
Ok(HandleInstancePutResultResult::Ok) => {}
390-
Ok(HandleInstancePutResultResult::TimedOut) => {
391-
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance. Rudely terminating it from our side."; "instance_id" => %instance_id);
390+
Ok(
391+
HandleInstancePutResultResult::TimedOut {
392+
generation,
393+
},
394+
) => {
395+
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance. Rudely terminating it from our side."; "instance_id" => %instance_id, "generation" => %generation);
392396
if let Err(err) = instance.terminate().await
393397
{
394398
error!(log, "Couldn't terminate instance whose creation was timed-out by Nexus"; "instance_id" => %instance_id, "err" => %err);
@@ -413,8 +417,12 @@ impl InstanceManager {
413417
.map(nexus_client::ResponseValue::into_inner)
414418
{
415419
Ok(HandleInstancePutResultResult::Ok) => {}
416-
Ok(HandleInstancePutResultResult::TimedOut) => {
417-
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance, but the instance also explicitly failed on our side. Rudely terminating what remains of it."; "instance_id" => %instance_id);
420+
Ok(
421+
HandleInstancePutResultResult::TimedOut {
422+
generation,
423+
},
424+
) => {
425+
error!(log, "Nexus gave up waiting for us to finish creating the propolis zone and abandoned the instance, but the instance also explicitly failed on our side. Rudely terminating what remains of it."; "instance_id" => %instance_id, "generation" => %generation);
418426
if let Err(err) = instance.terminate().await
419427
{
420428
error!(log, "Couldn't terminate faulted instance (whose creation was also timed-out by Nexus)"; "instance_id" => %instance_id, "err" => %err);

0 commit comments

Comments
 (0)