Skip to content

Commit 3bce9e8

Browse files
author
lif
committed
WIP: stringing internal api stuff together on both ends
1 parent 6817d9c commit 3bce9e8

File tree

4 files changed

+195
-27
lines changed

4 files changed

+195
-27
lines changed

nexus/src/app/instance.rs

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -976,31 +976,42 @@ impl super::Nexus {
976976
&self,
977977
opctx: &OpContext,
978978
instance_id: &Uuid,
979-
result: Result<
980-
Option<nexus::SledInstanceState>,
981-
sled_agent_client::types::Error,
982-
>,
979+
result: Result<nexus::SledInstanceState, Error>,
983980
) -> Result<(), Error> {
984-
let (.., authz_instance, db_instance) =
985-
LookupPath::new(&opctx, &self.db_datastore)
986-
.instance_id(*instance_id)
987-
.lookup_for(authz::Action::Modify)
988-
.await?;
981+
let (.., authz_instance) = LookupPath::new(&opctx, &self.db_datastore)
982+
.instance_id(*instance_id)
983+
.lookup_for(authz::Action::Modify)
984+
.await?;
989985

990986
let state = self
991987
.db_datastore
992988
.instance_fetch_with_vmm(opctx, &authz_instance)
993989
.await?;
994990

995991
// TODO: add param for sled-agent to show its 'previous' and compare with this
992+
// to validate consistency between nexus and sled-agent
996993
let prev_instance_runtime = &state.instance().runtime_state;
997994

998-
self.handle_instance_put_result(
999-
instance_id,
1000-
prev_instance_runtime,
1001-
result.map_err(Into::into), // TODO: this isn't real
1002-
).await?;
1003-
todo!()
995+
match result {
996+
Ok(new_state) => self
997+
.db_datastore
998+
.instance_and_vmm_update_runtime(
999+
instance_id,
1000+
&new_state.instance_state.into(),
1001+
&new_state.propolis_id,
1002+
&new_state.vmm_state.into(),
1003+
)
1004+
.await
1005+
.map(|_| ()),
1006+
Err(error) => {
1007+
self.mark_instance_failed(
1008+
instance_id,
1009+
prev_instance_runtime,
1010+
error,
1011+
)
1012+
.await
1013+
}
1014+
}
10041015
}
10051016

10061017
/// Modifies the runtime state of the Instance as requested. This generally
@@ -1296,7 +1307,7 @@ impl super::Nexus {
12961307
&self,
12971308
instance_id: &Uuid,
12981309
prev_instance_runtime: &db::model::InstanceRuntimeState,
1299-
reason: &SledAgentInstancePutError,
1310+
reason: impl std::fmt::Debug,
13001311
) -> Result<(), Error> {
13011312
error!(self.log, "marking instance failed due to sled agent API error";
13021313
"instance_id" => %instance_id,

nexus/src/internal_api/http_entrypoints.rs

Lines changed: 35 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ pub(crate) fn internal_api() -> NexusApiDescription {
5858
api.register(physical_disk_delete)?;
5959
api.register(zpool_put)?;
6060
api.register(cpapi_instances_put)?;
61+
api.register(cpapi_handle_instance_put_success)?;
62+
api.register(cpapi_handle_instance_put_failure)?;
6163
api.register(cpapi_disks_put)?;
6264
api.register(cpapi_volume_remove_read_only_parent)?;
6365
api.register(cpapi_disk_remove_read_only_parent)?;
@@ -269,24 +271,48 @@ async fn cpapi_instances_put(
269271
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
270272
}
271273

272-
/// Asynchronously report the result of a potentially long-running
274+
/// Asynchronously report the successful result of a potentially long-running
273275
/// instance_put call to sled-agent made during instance creation.
274276
#[endpoint {
275-
method = PUT,
276-
path = "/instances/{instance_id}/creation-result",
277+
method = PUT,
278+
path = "/instances/{instance_id}/creation-success",
279+
}]
280+
async fn cpapi_handle_instance_put_success(
281+
rqctx: RequestContext<Arc<ServerContext>>,
282+
path_params: Path<InstancePathParam>,
283+
instance_state: TypedBody<SledInstanceState>,
284+
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
285+
let apictx = rqctx.context();
286+
let nexus = &apictx.nexus;
287+
let path = path_params.into_inner();
288+
let result = Ok(instance_state.into_inner());
289+
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
290+
let handler = async {
291+
nexus
292+
.instance_handle_creation_result(&opctx, &path.instance_id, result)
293+
.await?;
294+
Ok(HttpResponseUpdatedNoContent())
295+
};
296+
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
297+
}
298+
299+
/// Asynchronously report the unsuccessful result of a potentially long-running
300+
/// instance_put call to sled-agent made during instance creation.
301+
#[endpoint {
302+
method = PUT,
303+
path = "/instances/{instance_id}/creation-failure",
277304
}]
278-
async fn cpapi_handle_instance_put_result(
305+
async fn cpapi_handle_instance_put_failure(
279306
rqctx: RequestContext<Arc<ServerContext>>,
280307
path_params: Path<InstancePathParam>,
281-
result: TypedBody<Result<
282-
Option<SledInstanceState>,
283-
sled_agent_client::types::Error
284-
>>,
308+
error: TypedBody<String>,
285309
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
286310
let apictx = rqctx.context();
287311
let nexus = &apictx.nexus;
288312
let path = path_params.into_inner();
289-
let result = result.into_inner();
313+
let result = Err(omicron_common::api::external::Error::InternalError {
314+
internal_message: error.into_inner(),
315+
});
290316
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
291317
let handler = async {
292318
nexus

openapi/nexus-internal.json

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,85 @@
230230
}
231231
}
232232
},
233+
"/instances/{instance_id}/creation-failure": {
234+
"put": {
235+
"summary": "Asynchronously report the unsuccessful result of a potentially long-running",
236+
"description": "instance_put call to sled-agent made during instance creation.",
237+
"operationId": "cpapi_handle_instance_put_failure",
238+
"parameters": [
239+
{
240+
"in": "path",
241+
"name": "instance_id",
242+
"required": true,
243+
"schema": {
244+
"type": "string",
245+
"format": "uuid"
246+
}
247+
}
248+
],
249+
"requestBody": {
250+
"content": {
251+
"application/json": {
252+
"schema": {
253+
"title": "String",
254+
"type": "string"
255+
}
256+
}
257+
},
258+
"required": true
259+
},
260+
"responses": {
261+
"204": {
262+
"description": "resource updated"
263+
},
264+
"4XX": {
265+
"$ref": "#/components/responses/Error"
266+
},
267+
"5XX": {
268+
"$ref": "#/components/responses/Error"
269+
}
270+
}
271+
}
272+
},
273+
"/instances/{instance_id}/creation-success": {
274+
"put": {
275+
"summary": "Asynchronously report the successful result of a potentially long-running",
276+
"description": "instance_put call to sled-agent made during instance creation.",
277+
"operationId": "cpapi_handle_instance_put_success",
278+
"parameters": [
279+
{
280+
"in": "path",
281+
"name": "instance_id",
282+
"required": true,
283+
"schema": {
284+
"type": "string",
285+
"format": "uuid"
286+
}
287+
}
288+
],
289+
"requestBody": {
290+
"content": {
291+
"application/json": {
292+
"schema": {
293+
"$ref": "#/components/schemas/SledInstanceState"
294+
}
295+
}
296+
},
297+
"required": true
298+
},
299+
"responses": {
300+
"204": {
301+
"description": "resource updated"
302+
},
303+
"4XX": {
304+
"$ref": "#/components/responses/Error"
305+
},
306+
"5XX": {
307+
"$ref": "#/components/responses/Error"
308+
}
309+
}
310+
}
311+
},
233312
"/metrics/collect/{producer_id}": {
234313
"get": {
235314
"summary": "Endpoint for oximeter to collect nexus server metrics.",

sled-agent/src/instance_manager.rs

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,8 +362,60 @@ impl InstanceManager {
362362
}
363363
};
364364

365-
let new_state = instance.put_state(target).await?;
366-
Ok(InstancePutStateResponse { updated_runtime: Some(new_state) })
365+
match target {
366+
// these may involve a long-running zone creation, so avoid HTTP
367+
// request timeouts by decoupling the response
368+
InstanceStateRequested::MigrationTarget(_)
369+
| InstanceStateRequested::Running => {
370+
let inst_mgr = self.inner.clone();
371+
let log =
372+
self.inner.log.new(o!("component" => "InstanceManager"));
373+
tokio::spawn(async move {
374+
let nexus_client = inst_mgr.nexus_client.client();
375+
match instance.put_state(target).await {
376+
Ok(state) => {
377+
let instance_state: nexus_client::types::SledInstanceState = state.into();
378+
if let Err(err) = nexus_client
379+
.cpapi_handle_instance_put_success(
380+
&instance_id,
381+
&instance_state,
382+
)
383+
.await
384+
{
385+
error!(log, "Failed to inform Nexus of instance_put success";
386+
"err" => %err,
387+
"instance_state" => ?instance_state,
388+
"instance_id" => %instance_id,
389+
);
390+
}
391+
}
392+
Err(instance_put_error) => {
393+
if let Err(err) = nexus_client
394+
.cpapi_handle_instance_put_failure(
395+
&instance_id,
396+
&instance_put_error.to_string(),
397+
)
398+
.await
399+
{
400+
error!(log, "Failed to inform Nexus of instance_put failure";
401+
"err" => %err,
402+
"instance_id" => %instance_id,
403+
"instance_put_error" => %instance_put_error,
404+
);
405+
}
406+
}
407+
};
408+
});
409+
Ok(InstancePutStateResponse { updated_runtime: None })
410+
}
411+
InstanceStateRequested::Stopped
412+
| InstanceStateRequested::Reboot => {
413+
let new_state = instance.put_state(target).await?;
414+
Ok(InstancePutStateResponse {
415+
updated_runtime: Some(new_state),
416+
})
417+
}
418+
}
367419
}
368420

369421
/// Idempotently attempts to set the instance's migration IDs to the

0 commit comments

Comments
 (0)