Skip to content

Commit b6f0150

Browse files
author
lif
committed
wip: plumbing timeout state in internal api response
1 parent 12e2dcb commit b6f0150

File tree

5 files changed

+101
-48
lines changed

5 files changed

+101
-48
lines changed

nexus/src/app/instance.rs

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use nexus_db_queries::db::datastore::InstanceAndActiveVmm;
2626
use nexus_db_queries::db::identity::Resource;
2727
use nexus_db_queries::db::lookup;
2828
use nexus_db_queries::db::lookup::LookupPath;
29+
use nexus_types::internal_api::views::HandleInstancePutResultResult;
2930
use omicron_common::address::PROPOLIS_PORT;
3031
use omicron_common::api::external::http_pagination::PaginatedBy;
3132
use omicron_common::api::external::ByteCount;
@@ -140,6 +141,7 @@ impl From<InstanceStateChangeError> for omicron_common::api::external::Error {
140141

141142
/// The kinds of state changes that can be requested of an instance's current
142143
/// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field).
144+
#[derive(Clone)]
143145
pub(crate) enum InstanceStateChangeRequest {
144146
Run,
145147
Reboot,
@@ -943,7 +945,9 @@ impl super::Nexus {
943945
let instance_put_result = sa
944946
.instance_put_state(
945947
&instance_id,
946-
&InstancePutStateBody { state: requested.into() },
948+
&InstancePutStateBody {
949+
state: requested.clone().into(),
950+
},
947951
)
948952
.await
949953
.map(|res| res.into_inner().updated_runtime.map(Into::into))
@@ -982,8 +986,8 @@ impl super::Nexus {
982986
tokio::spawn(async {
983987
tokio::time::sleep(Duration::from_secs(120))
984988
.await;
985-
todo: fail instance
986-
})
989+
todo!("fail instance")
990+
});
987991
}
988992
self.write_returned_instance_state(&instance_id, state)
989993
.await
@@ -1009,17 +1013,25 @@ impl super::Nexus {
10091013
opctx: &OpContext,
10101014
instance_id: &Uuid,
10111015
result: Result<nexus::SledInstanceState, Error>,
1012-
) -> Result<(), Error> {
1016+
) -> Result<HandleInstancePutResultResult, Error> {
10131017
let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore)
10141018
.instance_id(*instance_id)
10151019
.lookup_for(authz::Action::Modify)
10161020
.await?;
10171021

10181022
match result {
1019-
Ok(new_state) => self
1020-
.write_returned_instance_state(instance_id, Some(new_state))
1021-
.await
1022-
.map(|_| ()),
1023+
Ok(new_state) => {
1024+
let (inst_updated, vmm_updated) = self
1025+
.write_returned_instance_state(instance_id, Some(new_state))
1026+
.await?;
1027+
if !inst_updated && !vmm_updated {
1028+
// the generation number bumped up by the timeout task.
1029+
// TODO check actual state / put it in the TimedOut variant?
1030+
Ok(HandleInstancePutResultResult::TimedOut)
1031+
} else {
1032+
Ok(HandleInstancePutResultResult::Ok)
1033+
}
1034+
}
10231035
Err(error) => {
10241036
let state = self
10251037
.db_datastore

nexus/src/internal_api/http_entrypoints.rs

Lines changed: 6 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -304,20 +304,11 @@ async fn cpapi_handle_instance_put_success(
304304
// TODO: if instance_handle_creation_result errors because nexus gave
305305
// up waiting and marked the instance as failed, tell sled-agent to
306306
// destroy the instance
307-
match nexus
307+
nexus
308308
.instance_handle_creation_result(&opctx, &path.instance_id, result)
309309
.await
310-
{
311-
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
312-
Err(err) => {
313-
if todo {
314-
Ok(HandleInstancePutResultResult::TimedOut.into())
315-
} else {
316-
Err(err)
317-
}
318-
}
319-
}
320-
Ok(HttpResponseUpdatedNoContent())
310+
.map(HttpResponseOk)
311+
.map_err(Into::into)
321312
};
322313
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
323314
}
@@ -341,19 +332,11 @@ async fn cpapi_handle_instance_put_failure(
341332
});
342333
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
343334
let handler = async {
344-
match nexus
335+
nexus
345336
.instance_handle_creation_result(&opctx, &path.instance_id, result)
346337
.await
347-
{
348-
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
349-
Err(err) => {
350-
if todo {
351-
Ok(HandleInstancePutResultResult::TimedOut.into())
352-
} else {
353-
Err(err)
354-
}
355-
}
356-
}
338+
.map(HttpResponseOk)
339+
.map_err(Into::into)
357340
};
358341
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
359342
}

nexus/types/src/internal_api/views.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ pub struct LastResultCompleted {
301301
/// relevant, or if it's timed out during creation and been marked as failed
302302
/// (such that sled-agent can destroy the tardy instance)
303303
#[derive(Serialize, JsonSchema)]
304+
#[serde(rename_all = "snake_case", tag = "last_result", content = "details")]
304305
pub enum HandleInstancePutResultResult {
305306
Ok,
306307
TimedOut,

openapi/nexus-internal.json

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -466,8 +466,8 @@
466466
},
467467
"/instances/{instance_id}/creation-failure": {
468468
"put": {
469-
"summary": "Asynchronously report the unsuccessful result of a potentially long-running",
470-
"description": "instance_put call to sled-agent made during instance creation.",
469+
"summary": "Asynchronously report the unsuccessful result of certain instance_put calls",
470+
"description": "(such as the potentially long-running one made during instance creation)",
471471
"operationId": "cpapi_handle_instance_put_failure",
472472
"parameters": [
473473
{
@@ -492,8 +492,15 @@
492492
"required": true
493493
},
494494
"responses": {
495-
"204": {
496-
"description": "resource updated"
495+
"200": {
496+
"description": "successful operation",
497+
"content": {
498+
"application/json": {
499+
"schema": {
500+
"$ref": "#/components/schemas/HandleInstancePutResultResult"
501+
}
502+
}
503+
}
497504
},
498505
"4XX": {
499506
"$ref": "#/components/responses/Error"
@@ -506,8 +513,8 @@
506513
},
507514
"/instances/{instance_id}/creation-success": {
508515
"put": {
509-
"summary": "Asynchronously report the successful result of a potentially long-running",
510-
"description": "instance_put call to sled-agent made during instance creation.",
516+
"summary": "Asynchronously report the successful result of certain instance_put calls",
517+
"description": "(such as the potentially long-running one made during instance creation)",
511518
"operationId": "cpapi_handle_instance_put_success",
512519
"parameters": [
513520
{
@@ -531,8 +538,15 @@
531538
"required": true
532539
},
533540
"responses": {
534-
"204": {
535-
"description": "resource updated"
541+
"200": {
542+
"description": "successful operation",
543+
"content": {
544+
"application/json": {
545+
"schema": {
546+
"$ref": "#/components/schemas/HandleInstancePutResultResult"
547+
}
548+
}
549+
}
536550
},
537551
"4XX": {
538552
"$ref": "#/components/responses/Error"
@@ -3753,6 +3767,39 @@
37533767
"format": "uint64",
37543768
"minimum": 0
37553769
},
3770+
"HandleInstancePutResultResult": {
3771+
"description": "Tells sled-agent whether an instance whose status it's reporting is still relevant, or if it's timed out during creation and been marked as failed (such that sled-agent can destroy the tardy instance)",
3772+
"oneOf": [
3773+
{
3774+
"type": "object",
3775+
"properties": {
3776+
"last_result": {
3777+
"type": "string",
3778+
"enum": [
3779+
"ok"
3780+
]
3781+
}
3782+
},
3783+
"required": [
3784+
"last_result"
3785+
]
3786+
},
3787+
{
3788+
"type": "object",
3789+
"properties": {
3790+
"last_result": {
3791+
"type": "string",
3792+
"enum": [
3793+
"timed_out"
3794+
]
3795+
}
3796+
},
3797+
"required": [
3798+
"last_result"
3799+
]
3800+
}
3801+
]
3802+
},
37563803
"HistogramError": {
37573804
"description": "Errors related to constructing histograms or adding samples into them.",
37583805
"oneOf": [

sled-agent/src/instance_manager.rs

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use illumos_utils::link::VnicAllocator;
1919
use illumos_utils::opte::PortManager;
2020
use illumos_utils::running_zone::ZoneBuilderFactory;
2121
use illumos_utils::vmm_reservoir;
22+
use nexus_client::types::HandleInstancePutResultResult;
2223
use omicron_common::api::external::ByteCount;
2324
use omicron_common::api::internal::nexus::InstanceRuntimeState;
2425
use omicron_common::api::internal::nexus::SledInstanceState;
@@ -382,10 +383,12 @@ impl InstanceManager {
382383
&instance_state,
383384
)
384385
.await
386+
.map(nexus_client::ResponseValue::into_inner)
385387
{
386-
Ok(
387-
TODO HandleInstancePutResultResult::Ok
388-
) => {}
388+
Ok(HandleInstancePutResultResult::Ok) => {}
389+
Ok(HandleInstancePutResultResult::TimedOut) => {
390+
todo!("nexus doesn't want us any more, terminate instance")
391+
}
389392
Err(err) => {
390393
error!(log, "Failed to inform Nexus of instance_put success";
391394
"err" => %err,
@@ -396,18 +399,25 @@ impl InstanceManager {
396399
}
397400
}
398401
Err(instance_put_error) => {
399-
if let Err(err) = nexus_client
402+
match nexus_client
400403
.cpapi_handle_instance_put_failure(
401404
&instance_id,
402405
&instance_put_error.to_string(),
403406
)
404407
.await
408+
.map(nexus_client::ResponseValue::into_inner)
405409
{
406-
error!(log, "Failed to inform Nexus of instance_put failure";
407-
"err" => %err,
408-
"instance_id" => %instance_id,
409-
"instance_put_error" => %instance_put_error,
410-
);
410+
Ok(HandleInstancePutResultResult::Ok) => {}
411+
Ok(HandleInstancePutResultResult::TimedOut) => {
412+
todo!("well, i guess this is less awkward but clean up if we have to")
413+
}
414+
Err(err) => {
415+
error!(log, "Failed to inform Nexus of instance_put failure";
416+
"err" => %err,
417+
"instance_id" => %instance_id,
418+
"instance_put_error" => %instance_put_error,
419+
);
420+
}
411421
}
412422
}
413423
};

0 commit comments

Comments
 (0)