@@ -400,119 +400,143 @@ where
400400 ) ) ;
401401 }
402402
403- ensure_container (
404- & target_docker,
405- & name,
406- & image_ref,
407- & extra_sans,
408- ssh_gateway_host. as_deref ( ) ,
409- port,
410- disable_tls,
411- disable_gateway_auth,
412- registry_username. as_deref ( ) ,
413- registry_token. as_deref ( ) ,
414- gpu,
415- )
416- . await ?;
417- start_container ( & target_docker, & name) . await ?;
418-
419- // Clean up stale k3s nodes left over from previous container instances that
420- // used the same persistent volume. Without this, pods remain scheduled on
421- // NotReady ghost nodes and the health check will time out.
422- match clean_stale_nodes ( & target_docker, & name) . await {
423- Ok ( 0 ) => { }
424- Ok ( n) => tracing:: debug!( "removed {n} stale node(s)" ) ,
425- Err ( err) => {
426- tracing:: debug!( "stale node cleanup failed (non-fatal): {err}" ) ;
403+ // From this point on, Docker resources (container, volume, network) are
404+ // being created. If any subsequent step fails, we must clean up to avoid
405+ // leaving an orphaned volume in a corrupted state that blocks retries.
406+ // See: https://github.com/NVIDIA/OpenShell/issues/463
407+ let deploy_result: Result < GatewayMetadata > = async {
408+ ensure_container (
409+ & target_docker,
410+ & name,
411+ & image_ref,
412+ & extra_sans,
413+ ssh_gateway_host. as_deref ( ) ,
414+ port,
415+ disable_tls,
416+ disable_gateway_auth,
417+ registry_username. as_deref ( ) ,
418+ registry_token. as_deref ( ) ,
419+ gpu,
420+ )
421+ . await ?;
422+ start_container ( & target_docker, & name) . await ?;
423+
424+ // Clean up stale k3s nodes left over from previous container instances that
425+ // used the same persistent volume. Without this, pods remain scheduled on
426+ // NotReady ghost nodes and the health check will time out.
427+ match clean_stale_nodes ( & target_docker, & name) . await {
428+ Ok ( 0 ) => { }
429+ Ok ( n) => tracing:: debug!( "removed {n} stale node(s)" ) ,
430+ Err ( err) => {
431+ tracing:: debug!( "stale node cleanup failed (non-fatal): {err}" ) ;
432+ }
427433 }
428- }
429434
430- // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and
431- // valid; only generate fresh PKI when secrets are missing, incomplete,
432- // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS.
433- //
434- // Ordering is: reconcile secrets -> (if rotated and workload exists:
435- // rollout restart and wait) -> persist CLI-side bundle.
436- //
437- // We check workload presence before reconciliation. On a fresh/recreated
438- // cluster, secrets are always newly generated and a restart is unnecessary.
439- // Restarting only when workload pre-existed avoids extra rollout latency.
440- let workload_existed_before_pki = openshell_workload_exists ( & target_docker, & name) . await ?;
441- let ( pki_bundle, rotated) = reconcile_pki ( & target_docker, & name, & extra_sans, & log) . await ?;
442-
443- if rotated && workload_existed_before_pki {
444- // If an openshell workload is already running, it must be restarted so
445- // it picks up the new TLS secrets before we write CLI-side certs.
446- // A failed rollout is a hard error — CLI certs must not be persisted
447- // if the server cannot come up with the new PKI.
448- restart_openshell_deployment ( & target_docker, & name) . await ?;
449- }
435+ // Reconcile PKI: reuse existing cluster TLS secrets if they are complete and
436+ // valid; only generate fresh PKI when secrets are missing, incomplete,
437+ // malformed, or expiring within MIN_REMAINING_VALIDITY_DAYS.
438+ //
439+ // Ordering is: reconcile secrets -> (if rotated and workload exists:
440+ // rollout restart and wait) -> persist CLI-side bundle.
441+ //
442+ // We check workload presence before reconciliation. On a fresh/recreated
443+ // cluster, secrets are always newly generated and a restart is unnecessary.
444+ // Restarting only when workload pre-existed avoids extra rollout latency.
445+ let workload_existed_before_pki = openshell_workload_exists ( & target_docker, & name) . await ?;
446+ let ( pki_bundle, rotated) = reconcile_pki ( & target_docker, & name, & extra_sans, & log) . await ?;
447+
448+ if rotated && workload_existed_before_pki {
449+ // If an openshell workload is already running, it must be restarted so
450+ // it picks up the new TLS secrets before we write CLI-side certs.
451+ // A failed rollout is a hard error — CLI certs must not be persisted
452+ // if the server cannot come up with the new PKI.
453+ restart_openshell_deployment ( & target_docker, & name) . await ?;
454+ }
450455
451- store_pki_bundle ( & name, & pki_bundle) ?;
456+ store_pki_bundle ( & name, & pki_bundle) ?;
457+
458+ // Push locally-built component images into the k3s containerd runtime.
459+ // This is the "push" path for local development — images are exported from
460+ // the local Docker daemon and streamed into the cluster's containerd so
461+ // k3s can resolve them without pulling from the remote registry.
462+ if remote_opts. is_none ( )
463+ && let Ok ( push_images_str) = std:: env:: var ( "OPENSHELL_PUSH_IMAGES" )
464+ {
465+ let images: Vec < & str > = push_images_str
466+ . split ( ',' )
467+ . map ( str:: trim)
468+ . filter ( |s| !s. is_empty ( ) )
469+ . collect ( ) ;
470+ if !images. is_empty ( ) {
471+ log ( "[status] Deploying components" . to_string ( ) ) ;
472+ let local_docker = Docker :: connect_with_local_defaults ( ) . into_diagnostic ( ) ?;
473+ let container = container_name ( & name) ;
474+ let on_log_ref = Arc :: clone ( & on_log) ;
475+ let mut push_log = move |msg : String | {
476+ if let Ok ( mut f) = on_log_ref. lock ( ) {
477+ f ( msg) ;
478+ }
479+ } ;
480+ push:: push_local_images (
481+ & local_docker,
482+ & target_docker,
483+ & container,
484+ & images,
485+ & mut push_log,
486+ )
487+ . await ?;
452488
453- // Push locally-built component images into the k3s containerd runtime.
454- // This is the "push" path for local development — images are exported from
455- // the local Docker daemon and streamed into the cluster's containerd so
456- // k3s can resolve them without pulling from the remote registry.
457- if remote_opts. is_none ( )
458- && let Ok ( push_images_str) = std:: env:: var ( "OPENSHELL_PUSH_IMAGES" )
459- {
460- let images: Vec < & str > = push_images_str
461- . split ( ',' )
462- . map ( str:: trim)
463- . filter ( |s| !s. is_empty ( ) )
464- . collect ( ) ;
465- if !images. is_empty ( ) {
466- log ( "[status] Deploying components" . to_string ( ) ) ;
467- let local_docker = Docker :: connect_with_local_defaults ( ) . into_diagnostic ( ) ?;
468- let container = container_name ( & name) ;
489+ restart_openshell_deployment ( & target_docker, & name) . await ?;
490+ }
491+ }
492+
493+ log ( "[status] Starting gateway" . to_string ( ) ) ;
494+ {
495+ // Create a short-lived closure that locks on each call rather than holding
496+ // the MutexGuard across await points.
469497 let on_log_ref = Arc :: clone ( & on_log) ;
470- let mut push_log = move |msg : String | {
498+ let mut gateway_log = move |msg : String | {
471499 if let Ok ( mut f) = on_log_ref. lock ( ) {
472500 f ( msg) ;
473501 }
474502 } ;
475- push:: push_local_images (
476- & local_docker,
477- & target_docker,
478- & container,
479- & images,
480- & mut push_log,
481- )
482- . await ?;
483-
484- restart_openshell_deployment ( & target_docker, & name) . await ?;
503+ wait_for_gateway_ready ( & target_docker, & name, & mut gateway_log) . await ?;
485504 }
486- }
487505
488- log ( "[status] Starting gateway" . to_string ( ) ) ;
489- {
490- // Create a short-lived closure that locks on each call rather than holding
491- // the MutexGuard across await points.
492- let on_log_ref = Arc :: clone ( & on_log) ;
493- let mut gateway_log = move |msg : String | {
494- if let Ok ( mut f) = on_log_ref. lock ( ) {
495- f ( msg) ;
506+ // Create and store gateway metadata.
507+ let metadata = create_gateway_metadata_with_host (
508+ & name,
509+ remote_opts. as_ref ( ) ,
510+ port,
511+ ssh_gateway_host. as_deref ( ) ,
512+ disable_tls,
513+ ) ;
514+ store_gateway_metadata ( & name, & metadata) ?;
515+
516+ Ok ( metadata)
517+ }
518+ . await ;
519+
520+ match deploy_result {
521+ Ok ( metadata) => Ok ( GatewayHandle {
522+ name,
523+ metadata,
524+ docker : target_docker,
525+ } ) ,
526+ Err ( deploy_err) => {
527+ // Automatically clean up Docker resources (volume, container, network,
528+ // image) so the environment is left in a retryable state.
529+ tracing:: info!( "deploy failed, cleaning up gateway resources for '{name}'" ) ;
530+ if let Err ( cleanup_err) = destroy_gateway_resources ( & target_docker, & name) . await {
531+ tracing:: warn!(
532+ "automatic cleanup after failed deploy also failed: {cleanup_err}. \
533+ Manual cleanup may be required: \
534+ openshell gateway destroy --name {name}"
535+ ) ;
496536 }
497- } ;
498- wait_for_gateway_ready ( & target_docker , & name , & mut gateway_log ) . await ? ;
537+ Err ( deploy_err )
538+ }
499539 }
500-
501- // Create and store gateway metadata.
502- let metadata = create_gateway_metadata_with_host (
503- & name,
504- remote_opts. as_ref ( ) ,
505- port,
506- ssh_gateway_host. as_deref ( ) ,
507- disable_tls,
508- ) ;
509- store_gateway_metadata ( & name, & metadata) ?;
510-
511- Ok ( GatewayHandle {
512- name,
513- metadata,
514- docker : target_docker,
515- } )
516540}
517541
518542/// Get a handle to an existing gateway.
0 commit comments