Skip to content

Commit 12e2dcb

Browse files
author
lif
committed
messy WIP
1 parent be37bfe commit 12e2dcb

File tree

4 files changed

+93
-30
lines changed

4 files changed

+93
-30
lines changed

nexus/src/app/instance.rs

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ use sled_agent_client::types::InstancePutStateBody;
5555
use std::matches;
5656
use std::net::SocketAddr;
5757
use std::sync::Arc;
58+
use std::time::Duration;
5859
use tokio::io::{AsyncRead, AsyncWrite};
5960
use uuid::Uuid;
6061

@@ -956,14 +957,39 @@ impl super::Nexus {
956957
// the caller to let it decide how to handle it.
957958
//
958959
// When creating the zone for the first time, we just get
959-
// Ok(None) here, which is a no-op. We later asynchronously get
960-
// a cpapi call invoking Self::write_returned_instance_state
960+
// Ok(None) here, which is a no-op in write_returned_instance_state.
961+
// (We later asynchronously receive a cpapi call that invokes
962+
// write_returned_instance_state with the outcome.)
961963
match instance_put_result {
962-
Ok(state) => self
963-
.write_returned_instance_state(&instance_id, state)
964-
.await
965-
.map(|_| ())
966-
.map_err(Into::into),
964+
Ok(state) => {
965+
if state.is_none()
966+
&& matches!(
967+
requested,
968+
InstanceStateChangeRequest::Run
969+
| InstanceStateChangeRequest::Migrate(..)
970+
)
971+
{
972+
// TODO: This is fragile -- suppose the nexus with
973+
// this task crashes *and* the instance creation
974+
// happens to also hang. The new nexus won't know
975+
// to give up on the instance! In an ideal world
976+
// this would be handled by a RPW that polls all
977+
// the instances nexus knows about and finds any
978+
// that have been in a still-starting state for too
979+
// long -- say, InstanceRuntimeState::time_updated
980+
// plus the timeout, assuming time_updated is the
981+
// right point to measure from.
982+
tokio::spawn(async {
983+
tokio::time::sleep(Duration::from_secs(120))
984+
.await;
985+
todo: fail instance
986+
})
987+
}
988+
self.write_returned_instance_state(&instance_id, state)
989+
.await
990+
.map(|_| ())
991+
.map_err(Into::into)
992+
}
967993
Err(e) => Err(InstanceStateChangeError::SledAgent(e)),
968994
}
969995
}

nexus/src/internal_api/http_entrypoints.rs

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ use nexus_types::internal_api::params::SwitchPutRequest;
3131
use nexus_types::internal_api::params::SwitchPutResponse;
3232
use nexus_types::internal_api::views::to_list;
3333
use nexus_types::internal_api::views::BackgroundTask;
34+
use nexus_types::internal_api::views::HandleInstancePutResultResult;
3435
use nexus_types::internal_api::views::Saga;
3536
use omicron_common::api::external::http_pagination::data_page_params_for;
3637
use omicron_common::api::external::http_pagination::PaginatedById;
@@ -283,42 +284,55 @@ async fn cpapi_instances_put(
283284
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
284285
}
285286

286-
/// Asynchronously report the successful result of a potentially long-running
287-
/// instance_put call to sled-agent made during instance creation.
287+
/// Asynchronously report the successful result of certain instance_put calls
288+
/// (such as the potentially long-running one made during instance creation)
288289
#[endpoint {
289-
method = PUT,
290-
path = "/instances/{instance_id}/creation-success",
291-
}]
290+
method = PUT,
291+
path = "/instances/{instance_id}/creation-success",
292+
}]
292293
async fn cpapi_handle_instance_put_success(
293294
rqctx: RequestContext<Arc<ServerContext>>,
294295
path_params: Path<InstancePathParam>,
295296
instance_state: TypedBody<SledInstanceState>,
296-
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
297+
) -> Result<HttpResponseOk<HandleInstancePutResultResult>, HttpError> {
297298
let apictx = rqctx.context();
298299
let nexus = &apictx.nexus;
299300
let path = path_params.into_inner();
300301
let result = Ok(instance_state.into_inner());
301302
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
302303
let handler = async {
303-
nexus
304+
// TODO: if instance_handle_creation_result errors because nexus gave
305+
// up waiting and marked the instance as failed, tell sled-agent to
306+
// destroy the instance
307+
match nexus
304308
.instance_handle_creation_result(&opctx, &path.instance_id, result)
305-
.await?;
309+
.await
310+
{
311+
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
312+
Err(err) => {
313+
if todo {
314+
Ok(HandleInstancePutResultResult::TimedOut.into())
315+
} else {
316+
Err(err)
317+
}
318+
}
319+
}
306320
Ok(HttpResponseUpdatedNoContent())
307321
};
308322
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
309323
}
310324

311-
/// Asynchronously report the unsuccessful result of a potentially long-running
312-
/// instance_put call to sled-agent made during instance creation.
325+
/// Asynchronously report the unsuccessful result of certain instance_put calls
326+
/// (such as the potentially long-running one made during instance creation)
313327
#[endpoint {
314-
method = PUT,
315-
path = "/instances/{instance_id}/creation-failure",
316-
}]
328+
method = PUT,
329+
path = "/instances/{instance_id}/creation-failure",
330+
}]
317331
async fn cpapi_handle_instance_put_failure(
318332
rqctx: RequestContext<Arc<ServerContext>>,
319333
path_params: Path<InstancePathParam>,
320334
error: TypedBody<String>,
321-
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
335+
) -> Result<HttpResponseOk<HandleInstancePutResultResult>, HttpError> {
322336
let apictx = rqctx.context();
323337
let nexus = &apictx.nexus;
324338
let path = path_params.into_inner();
@@ -327,10 +341,19 @@ async fn cpapi_handle_instance_put_failure(
327341
});
328342
let opctx = crate::context::op_context_for_internal_api(&rqctx).await;
329343
let handler = async {
330-
nexus
344+
match nexus
331345
.instance_handle_creation_result(&opctx, &path.instance_id, result)
332-
.await?;
333-
Ok(HttpResponseUpdatedNoContent())
346+
.await
347+
{
348+
Ok(_) => Ok(HandleInstancePutResultResult::Ok.into()),
349+
Err(err) => {
350+
if todo {
351+
Ok(HandleInstancePutResultResult::TimedOut.into())
352+
} else {
353+
Err(err)
354+
}
355+
}
356+
}
334357
};
335358
apictx.internal_latencies.instrument_dropshot_handler(&rqctx, handler).await
336359
}

nexus/types/src/internal_api/views.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,12 @@ pub struct LastResultCompleted {
296296
/// arbitrary datum emitted by the background task
297297
pub details: serde_json::Value,
298298
}
299+
300+
/// Tells sled-agent whether an instance whose status it's reporting is still
301+
/// relevant, or if it's timed out during creation and been marked as failed
302+
/// (such that sled-agent can destroy the tardy instance)
303+
#[derive(Serialize, JsonSchema)]
304+
pub enum HandleInstancePutResultResult {
305+
Ok,
306+
TimedOut,
307+
}

sled-agent/src/instance_manager.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -376,18 +376,23 @@ impl InstanceManager {
376376
match instance.put_state(target).await {
377377
Ok(state) => {
378378
let instance_state: nexus_client::types::SledInstanceState = state.into();
379-
if let Err(err) = nexus_client
379+
match nexus_client
380380
.cpapi_handle_instance_put_success(
381381
&instance_id,
382382
&instance_state,
383383
)
384384
.await
385385
{
386-
error!(log, "Failed to inform Nexus of instance_put success";
387-
"err" => %err,
388-
"instance_state" => ?instance_state,
389-
"instance_id" => %instance_id,
390-
);
386+
Ok(
387+
TODO HandleInstancePutResultResult::Ok
388+
) => {}
389+
Err(err) => {
390+
error!(log, "Failed to inform Nexus of instance_put success";
391+
"err" => %err,
392+
"instance_state" => ?instance_state,
393+
"instance_id" => %instance_id,
394+
)
395+
}
391396
}
392397
}
393398
Err(instance_put_error) => {

0 commit comments

Comments
 (0)