@@ -342,10 +342,15 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type,
342
342
if container_obj is not None :
343
343
out_logs , err_logs = None , None
344
344
try :
345
- out_logs = container_obj .logs (stdout = True , stderr = False , stream = False , follow = False ,
346
- since = last_log_time )
347
- err_logs = container_obj .logs (stdout = False , stderr = True , stream = False , follow = False ,
348
- since = last_log_time )
345
+ if container_obj .status == "exited" :
346
+ # If the container has exited, we need to get the whole logs from the container
347
+ out_logs = container_obj .logs (stdout = True , stderr = False , stream = False , follow = False )
348
+ err_logs = container_obj .logs (stdout = False , stderr = True , stream = False , follow = False )
349
+ else :
350
+ out_logs = container_obj .logs (stdout = True , stderr = False , stream = False , follow = False ,
351
+ since = last_log_time )
352
+ err_logs = container_obj .logs (stdout = False , stderr = True , stream = False , follow = False ,
353
+ since = last_log_time )
349
354
except Exception as e :
350
355
logging .error (f"Failed to get the logs from the container with exception { e } " )
351
356
pass
@@ -355,16 +360,29 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type,
355
360
if err_logs is not None :
356
361
err_logs = sys_utils .decode_our_err_result (err_logs )
357
362
if len (err_logs ) > 0 :
358
- logging .error (f"{ format (err_logs )} " )
363
+ logging .error (f"[-- Container Error Logs Start --] \n { format (err_logs )} \n [-- Container Error Logs End --] " )
359
364
360
365
if out_logs is not None :
361
366
out_logs = sys_utils .decode_our_err_result (out_logs )
362
367
if len (out_logs ) > 0 :
363
- logging .info (f"{ format (out_logs )} " )
368
+ logging .info (f"[-- Container Stdout Logs Start --] \n { format (out_logs )} \n [-- Container Stdout Logs End --] " )
364
369
365
370
if container_obj .status == "exited" :
366
371
logging .info ("Container {} has exited, automatically remove it" .format (cmd_container_name ))
367
372
373
+ # try to get the logs from the filesystem
374
+ if out_logs is None or err_logs is None :
375
+ try :
376
+ logs_path = f"/var/lib/docker/containers/{ container_obj .id } /{ container_obj .id } -json.log"
377
+ if os .path .exists (logs_path ):
378
+ with open (logs_path , 'r' ) as f :
379
+ raw_logs = f .readlines ()
380
+ out_logs = '\n ' .join ([line for line in raw_logs if '"stream":"stdout"' in line ])
381
+ err_logs = '\n ' .join ([line for line in raw_logs if '"stream":"stderr"' in line ])
382
+ logging .error (f"read Container Error Logs from log file: { err_logs } " )
383
+ except Exception as e :
384
+ logging .warning (f"Failed to read logs from filesystem: { str (e )} " )
385
+
368
386
# Save the failed log into ~/.fedml/fedml-model-client/fedml/logs/failed_logs/
369
387
# $run_id/$container_name.log
370
388
try :
0 commit comments