Skip to content

Commit 4878b9b

Browse files
authored
fix(bootstrap): auto-cleanup Docker resources on failed gateway deploy (#464)
1 parent a912848 commit 4878b9b

File tree

2 files changed

+210
-107
lines changed

2 files changed

+210
-107
lines changed

crates/openshell-bootstrap/src/errors.rs

Lines changed: 86 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -175,21 +175,19 @@ fn diagnose_corrupted_state(gateway_name: &str) -> GatewayFailureDiagnosis {
175175
GatewayFailureDiagnosis {
176176
summary: "Corrupted cluster state".to_string(),
177177
explanation: "The gateway cluster has corrupted internal state, likely from a previous \
178-
interrupted startup or unclean shutdown."
178+
interrupted startup or unclean shutdown. Resources from the failed deploy have been \
179+
automatically cleaned up."
179180
.to_string(),
180181
recovery_steps: vec![
182+
RecoveryStep::new("Retry the gateway start (cleanup was automatic)"),
181183
RecoveryStep::with_command(
182-
"Destroy and recreate the gateway",
184+
"If the retry fails, manually destroy and recreate",
183185
format!(
184186
"openshell gateway destroy --name {gateway_name} && openshell gateway start"
185187
),
186188
),
187-
RecoveryStep::with_command(
188-
"If that fails, remove the volume for a clean slate",
189-
format!("docker volume rm openshell-cluster-{gateway_name}"),
190-
),
191189
],
192-
retryable: false,
190+
retryable: true,
193191
}
194192
}
195193

@@ -483,6 +481,87 @@ mod tests {
483481
assert!(d.summary.contains("Corrupted"));
484482
}
485483

484+
#[test]
485+
fn test_diagnose_corrupted_state_is_retryable_after_auto_cleanup() {
486+
// After the auto-cleanup fix for issue #463 (PR #464), corrupted state errors should be
487+
// marked retryable because deploy_gateway_with_logs now automatically
488+
// cleans up Docker resources on failure.
489+
let d = diagnose_failure(
490+
"mygw",
491+
"K8s namespace not ready",
492+
Some("configmaps \"extension-apiserver-authentication\" is forbidden"),
493+
)
494+
.expect("should match corrupted state pattern");
495+
assert!(
496+
d.retryable,
497+
"corrupted state should be retryable after auto-cleanup"
498+
);
499+
assert!(
500+
d.explanation.contains("automatically cleaned up"),
501+
"explanation should mention automatic cleanup, got: {}",
502+
d.explanation
503+
);
504+
}
505+
506+
#[test]
507+
fn test_diagnose_corrupted_state_recovery_no_manual_volume_rm() {
508+
// The recovery steps should no longer include a manual docker volume rm
509+
// command, since cleanup is now automatic. The first step should tell
510+
// the user to simply retry.
511+
let d = diagnose_failure("mygw", "cannot get resource \"namespaces\"", None)
512+
.expect("should match corrupted state pattern");
513+
514+
let all_commands: Vec<String> = d
515+
.recovery_steps
516+
.iter()
517+
.filter_map(|s| s.command.clone())
518+
.collect();
519+
let all_commands_joined = all_commands.join(" ");
520+
521+
assert!(
522+
!all_commands_joined.contains("docker volume rm"),
523+
"recovery steps should not include manual docker volume rm, got: {all_commands_joined}"
524+
);
525+
526+
// First step should be a description-only step (no command) about retrying
527+
assert!(
528+
d.recovery_steps[0].command.is_none(),
529+
"first recovery step should be description-only (automatic cleanup)"
530+
);
531+
assert!(
532+
d.recovery_steps[0]
533+
.description
534+
.contains("cleanup was automatic"),
535+
"first recovery step should mention automatic cleanup"
536+
);
537+
}
538+
539+
#[test]
540+
fn test_diagnose_corrupted_state_fallback_step_includes_gateway_name() {
541+
// The fallback recovery step should interpolate the gateway name so
542+
// users can copy-paste the command.
543+
let d = diagnose_failure("my-gateway", "is forbidden", None)
544+
.expect("should match corrupted state pattern");
545+
546+
assert!(
547+
d.recovery_steps.len() >= 2,
548+
"should have at least 2 recovery steps"
549+
);
550+
let fallback = &d.recovery_steps[1];
551+
let cmd = fallback
552+
.command
553+
.as_deref()
554+
.expect("fallback step should have a command");
555+
assert!(
556+
cmd.contains("my-gateway"),
557+
"fallback command should contain gateway name, got: {cmd}"
558+
);
559+
assert!(
560+
cmd.contains("openshell gateway destroy"),
561+
"fallback command should include gateway destroy, got: {cmd}"
562+
);
563+
}
564+
486565
#[test]
487566
fn test_diagnose_no_default_route() {
488567
let diagnosis = diagnose_failure(

crates/openshell-bootstrap/src/lib.rs

Lines changed: 124 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -400,119 +400,143 @@ where
400400
));
401401
}
402402

403-
ensure_container(
404-
&target_docker,
405-
&name,
406-
&image_ref,
407-
&extra_sans,
408-
ssh_gateway_host.as_deref(),
409-
port,
410-
disable_tls,
411-
disable_gateway_auth,
412-
registry_username.as_deref(),
413-
registry_token.as_deref(),
414-
gpu,
415-
)
416-
.await?;
417-
start_container(&target_docker, &name).await?;
418-
419-
// Clean up stale k3s nodes left over from previous container instances that
420-
// used the same persistent volume. Without this, pods remain scheduled on
421-
// NotReady ghost nodes and the health check will time out.
422-
match clean_stale_nodes(&target_docker, &name).await {
423-
Ok(0) => {}
424-
Ok(n) => tracing::debug!("removed {n} stale node(s)"),
425-
Err(err) => {
426-
tracing::debug!("stale node cleanup failed (non-fatal): {err}");
403+
// From this point on, Docker resources (container, volume, network) are
404+
// being created. If any subsequent step fails, we must clean up to avoid
405+
// leaving an orphaned volume in a corrupted state that blocks retries.
406+
// See: https://github.com/NVIDIA/OpenShell/issues/463
407+
let deploy_result: Result<GatewayMetadata> = async {
408+
ensure_container(
409+
&target_docker,
410+
&name,
411+
&image_ref,
412+
&extra_sans,
413+
ssh_gateway_host.as_deref(),
414+
port,
415+
disable_tls,
416+
disable_gateway_auth,
417+
registry_username.as_deref(),
418+
registry_token.as_deref(),
419+
gpu,
420+
)
421+
.await?;
422+
start_container(&target_docker, &name).await?;
423+
424+
// Clean up stale k3s nodes left over from previous container instances that
425+
// used the same persistent volume. Without this, pods remain scheduled on
426+
// NotReady ghost nodes and the health check will time out.
427+
match clean_stale_nodes(&target_docker, &name).await {
428+
Ok(0) => {}
429+
Ok(n) => tracing::debug!("removed {n} stale node(s)"),
430+
Err(err) => {
431+
tracing::debug!("stale node cleanup failed (non-fatal): {err}");
432+
}
427433
}
428-
}
429434

430-
// Reconcile PKI: reuse existing cluster TLS secrets if they are complete and
431-
// valid; only generate fresh PKI when secrets are missing, incomplete,
432-
// malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS.
433-
//
434-
// Ordering is: reconcile secrets -> (if rotated and workload exists:
435-
// rollout restart and wait) -> persist CLI-side bundle.
436-
//
437-
// We check workload presence before reconciliation. On a fresh/recreated
438-
// cluster, secrets are always newly generated and a restart is unnecessary.
439-
// Restarting only when workload pre-existed avoids extra rollout latency.
440-
let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?;
441-
let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?;
442-
443-
if rotated && workload_existed_before_pki {
444-
// If an openshell workload is already running, it must be restarted so
445-
// it picks up the new TLS secrets before we write CLI-side certs.
446-
// A failed rollout is a hard error — CLI certs must not be persisted
447-
// if the server cannot come up with the new PKI.
448-
restart_openshell_deployment(&target_docker, &name).await?;
449-
}
435+
// Reconcile PKI: reuse existing cluster TLS secrets if they are complete and
436+
// valid; only generate fresh PKI when secrets are missing, incomplete,
437+
// malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS.
438+
//
439+
// Ordering is: reconcile secrets -> (if rotated and workload exists:
440+
// rollout restart and wait) -> persist CLI-side bundle.
441+
//
442+
// We check workload presence before reconciliation. On a fresh/recreated
443+
// cluster, secrets are always newly generated and a restart is unnecessary.
444+
// Restarting only when workload pre-existed avoids extra rollout latency.
445+
let workload_existed_before_pki = openshell_workload_exists(&target_docker, &name).await?;
446+
let (pki_bundle, rotated) = reconcile_pki(&target_docker, &name, &extra_sans, &log).await?;
447+
448+
if rotated && workload_existed_before_pki {
449+
// If an openshell workload is already running, it must be restarted so
450+
// it picks up the new TLS secrets before we write CLI-side certs.
451+
// A failed rollout is a hard error — CLI certs must not be persisted
452+
// if the server cannot come up with the new PKI.
453+
restart_openshell_deployment(&target_docker, &name).await?;
454+
}
450455

451-
store_pki_bundle(&name, &pki_bundle)?;
456+
store_pki_bundle(&name, &pki_bundle)?;
457+
458+
// Push locally-built component images into the k3s containerd runtime.
459+
// This is the "push" path for local development — images are exported from
460+
// the local Docker daemon and streamed into the cluster's containerd so
461+
// k3s can resolve them without pulling from the remote registry.
462+
if remote_opts.is_none()
463+
&& let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES")
464+
{
465+
let images: Vec<&str> = push_images_str
466+
.split(',')
467+
.map(str::trim)
468+
.filter(|s| !s.is_empty())
469+
.collect();
470+
if !images.is_empty() {
471+
log("[status] Deploying components".to_string());
472+
let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?;
473+
let container = container_name(&name);
474+
let on_log_ref = Arc::clone(&on_log);
475+
let mut push_log = move |msg: String| {
476+
if let Ok(mut f) = on_log_ref.lock() {
477+
f(msg);
478+
}
479+
};
480+
push::push_local_images(
481+
&local_docker,
482+
&target_docker,
483+
&container,
484+
&images,
485+
&mut push_log,
486+
)
487+
.await?;
452488

453-
// Push locally-built component images into the k3s containerd runtime.
454-
// This is the "push" path for local development — images are exported from
455-
// the local Docker daemon and streamed into the cluster's containerd so
456-
// k3s can resolve them without pulling from the remote registry.
457-
if remote_opts.is_none()
458-
&& let Ok(push_images_str) = std::env::var("OPENSHELL_PUSH_IMAGES")
459-
{
460-
let images: Vec<&str> = push_images_str
461-
.split(',')
462-
.map(str::trim)
463-
.filter(|s| !s.is_empty())
464-
.collect();
465-
if !images.is_empty() {
466-
log("[status] Deploying components".to_string());
467-
let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?;
468-
let container = container_name(&name);
489+
restart_openshell_deployment(&target_docker, &name).await?;
490+
}
491+
}
492+
493+
log("[status] Starting gateway".to_string());
494+
{
495+
// Create a short-lived closure that locks on each call rather than holding
496+
// the MutexGuard across await points.
469497
let on_log_ref = Arc::clone(&on_log);
470-
let mut push_log = move |msg: String| {
498+
let mut gateway_log = move |msg: String| {
471499
if let Ok(mut f) = on_log_ref.lock() {
472500
f(msg);
473501
}
474502
};
475-
push::push_local_images(
476-
&local_docker,
477-
&target_docker,
478-
&container,
479-
&images,
480-
&mut push_log,
481-
)
482-
.await?;
483-
484-
restart_openshell_deployment(&target_docker, &name).await?;
503+
wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?;
485504
}
486-
}
487505

488-
log("[status] Starting gateway".to_string());
489-
{
490-
// Create a short-lived closure that locks on each call rather than holding
491-
// the MutexGuard across await points.
492-
let on_log_ref = Arc::clone(&on_log);
493-
let mut gateway_log = move |msg: String| {
494-
if let Ok(mut f) = on_log_ref.lock() {
495-
f(msg);
506+
// Create and store gateway metadata.
507+
let metadata = create_gateway_metadata_with_host(
508+
&name,
509+
remote_opts.as_ref(),
510+
port,
511+
ssh_gateway_host.as_deref(),
512+
disable_tls,
513+
);
514+
store_gateway_metadata(&name, &metadata)?;
515+
516+
Ok(metadata)
517+
}
518+
.await;
519+
520+
match deploy_result {
521+
Ok(metadata) => Ok(GatewayHandle {
522+
name,
523+
metadata,
524+
docker: target_docker,
525+
}),
526+
Err(deploy_err) => {
527+
// Automatically clean up Docker resources (container, volume, network,
528+
// image) so the environment is left in a retryable state.
529+
tracing::info!("deploy failed, cleaning up gateway resources for '{name}'");
530+
if let Err(cleanup_err) = destroy_gateway_resources(&target_docker, &name).await {
531+
tracing::warn!(
532+
"automatic cleanup after failed deploy also failed: {cleanup_err}. \
533+
Manual cleanup may be required: \
534+
openshell gateway destroy --name {name}"
535+
);
496536
}
497-
};
498-
wait_for_gateway_ready(&target_docker, &name, &mut gateway_log).await?;
537+
Err(deploy_err)
538+
}
499539
}
500-
501-
// Create and store gateway metadata.
502-
let metadata = create_gateway_metadata_with_host(
503-
&name,
504-
remote_opts.as_ref(),
505-
port,
506-
ssh_gateway_host.as_deref(),
507-
disable_tls,
508-
);
509-
store_gateway_metadata(&name, &metadata)?;
510-
511-
Ok(GatewayHandle {
512-
name,
513-
metadata,
514-
docker: target_docker,
515-
})
516540
}
517541

518542
/// Get a handle to an existing gateway.

0 commit comments

Comments
 (0)