Skip to content

Commit b11558b

Browse files
committed
[feature] Enhance container log retrieval for exited containers
1 parent 8ac783d commit b11558b

File tree

1 file changed

+24
-6
lines changed

1 file changed

+24
-6
lines changed

python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py

Lines changed: 24 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -342,10 +342,15 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type,
342342
if container_obj is not None:
343343
out_logs, err_logs = None, None
344344
try:
345-
out_logs = container_obj.logs(stdout=True, stderr=False, stream=False, follow=False,
346-
since=last_log_time)
347-
err_logs = container_obj.logs(stdout=False, stderr=True, stream=False, follow=False,
348-
since=last_log_time)
345+
if container_obj.status == "exited":
346+
# If the container has exited, we need to get the whole logs from the container
347+
out_logs = container_obj.logs(stdout=True, stderr=False, stream=False, follow=False)
348+
err_logs = container_obj.logs(stdout=False, stderr=True, stream=False, follow=False)
349+
else:
350+
out_logs = container_obj.logs(stdout=True, stderr=False, stream=False, follow=False,
351+
since=last_log_time)
352+
err_logs = container_obj.logs(stdout=False, stderr=True, stream=False, follow=False,
353+
since=last_log_time)
349354
except Exception as e:
350355
logging.error(f"Failed to get the logs from the container with exception {e}")
351356
pass
@@ -355,16 +360,29 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type,
355360
if err_logs is not None:
356361
err_logs = sys_utils.decode_our_err_result(err_logs)
357362
if len(err_logs) > 0:
358-
logging.error(f"{format(err_logs)}")
363+
logging.error(f"[-- Container Error Logs Start --]\n{format(err_logs)}\n[-- Container Error Logs End --]")
359364

360365
if out_logs is not None:
361366
out_logs = sys_utils.decode_our_err_result(out_logs)
362367
if len(out_logs) > 0:
363-
logging.info(f"{format(out_logs)}")
368+
logging.info(f"[-- Container Stdout Logs Start --]\n{format(out_logs)}\n[-- Container Stdout Logs End --]")
364369

365370
if container_obj.status == "exited":
366371
logging.info("Container {} has exited, automatically remove it".format(cmd_container_name))
367372

373+
# try to get the logs from the filesystem
374+
if out_logs is None or err_logs is None:
375+
try:
376+
logs_path = f"/var/lib/docker/containers/{container_obj.id}/{container_obj.id}-json.log"
377+
if os.path.exists(logs_path):
378+
with open(logs_path, 'r') as f:
379+
raw_logs = f.readlines()
380+
out_logs = '\n'.join([line for line in raw_logs if '"stream":"stdout"' in line])
381+
err_logs = '\n'.join([line for line in raw_logs if '"stream":"stderr"' in line])
382+
logging.error(f"read Container Error Logs from log file: {err_logs}")
383+
except Exception as e:
384+
logging.warning(f"Failed to read logs from filesystem: {str(e)}")
385+
368386
# Save the failed log into ~/.fedml/fedml-model-client/fedml/logs/failed_logs/
369387
# $run_id/$container_name.log
370388
try:

0 commit comments

Comments
 (0)