
Commit ca6de8c

feat: allow multiple node jobs
There is a bug in the Kubernetes tracker: we treat the failed/succeeded status fields as booleans (0/1) when they are actually counts of job indices. We have not run experiments with more than one node, so this has not been an issue (or been caught). This change fixes it.

Signed-off-by: vsoch <[email protected]>
1 parent bdc85ae commit ca6de8c
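For context, here is a minimal sketch of the mismatch the commit describes (assuming the official kubernetes Python client; this snippet is illustrative and not part of the commit):

from kubernetes import client

# A 4-node job where every pod index finished successfully.
status = client.V1JobStatus(succeeded=4, failed=None, active=None)

# Boolean-style check (the old behavior) misses multi-node jobs:
print(status.succeeded == 1)  # False, even though the job succeeded

# Count-aware check (the behavior after this commit):
print(status.succeeded is not None and status.succeeded > 0)  # True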

File tree: 7 files changed, +95 −25 lines


examples/test/multiple-pods.yaml

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+apiVersion: state-machine.converged-computing.org/v1alpha1
+kind: StateMachine
+metadata:
+  name: state-machine
+spec:
+  manager:
+    pullPolicy: Never
+    interactive: true
+  workflow:
+    completed: 2
+  cluster:
+    maxSize: 2
+
+  jobs:
+    - name: job_a
+      properties:
+        save-path: "/opt"
+      config:
+        nodes: 1
+        coresPerTask: 1
+      image: rockylinux:9
+      script: echo This is the first
+
+    - name: job_b
+      properties:
+        save-path: "/opt"
+      config:
+        nodes: 4
+        coresPerTask: 1
+      image: rockylinux:9
+      script: echo This is the second
+
+    - name: job_c
+      config:
+        nodes: 1
+        coresPerTask: 1
+      properties:
+        save-path: "/opt"
+      image: rockylinux:9
+      script: echo This is the third
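Note that job_b requests nodes: 4; presumably this is the case that exercises the fix, since a multi-node job reports succeeded/failed as counts of pod indices that can exceed one, which the old boolean checks misread.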

examples/test/save-logs.yaml

Lines changed: 3 additions & 3 deletions (the changed lines are blank on both sides; this looks like trailing-whitespace cleanup)
@@ -5,7 +5,7 @@ metadata:
 spec:
   manager:
     pullPolicy: Never
-
+
   workflow:
     completed: 2
   cluster:
@@ -20,7 +20,7 @@ spec:
         coresPerTask: 1
       image: rockylinux:9
       script: echo This is the first
-
+
     - name: job_b
       properties:
         save-path: "/opt"
@@ -29,7 +29,7 @@ spec:
        coresPerTask: 1
       image: rockylinux:9
       script: echo This is the second
-
+
     - name: job_c
      config:
        nodes: 1

python/state_machine_operator/manager/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -60,6 +60,12 @@ def get_parser():
         "--config-dir",
         help="Directory with configuration files.",
     )
+    start.add_argument(
+        "--quiet",
+        help="Don't print progress",
+        default=False,
+        action="store_true",
+    )
     start.add_argument(
         "--plain-http",
         help="Use plain http for the registry.",
@@ -108,6 +114,7 @@ def help(return_code=0):
         # Will overwrite what is set in config
         workdir=args.workdir,
         plain_http=args.plain_http,
+        quiet=args.quiet,
     )
     manager.start()
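A self-contained sketch of the flag semantics (standard argparse; the flag name and help text mirror the diff): store_true makes --quiet default to False and flip to True when passed.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--quiet", help="Don't print progress", default=False, action="store_true")

print(parser.parse_args([]).quiet)           # False
print(parser.parse_args(["--quiet"]).quiet)  # True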

python/state_machine_operator/manager/manager.py

Lines changed: 23 additions & 15 deletions
@@ -31,6 +31,7 @@ def __init__(
         workdir=None,
         registry=None,
         plain_http=False,
+        quiet=False,
     ):
         """
         Initialize the WorkflowManager. Much of this logic used to be in setup,
@@ -47,6 +48,7 @@ def __init__(

         # Working directory = first preference to command line
         self.workflow.set_workdir(workdir)
+        self.quiet = quiet

         # Running modes (we only allow kubernetes for now)
         LOGGER.info(f" Job Prefix: [{self.prefix}]")
@@ -137,7 +139,7 @@ def get_current_state(self):
         jobs = self.list_jobs_by_status()

         # Give a warning about unknown jobs
-        # In practice, I don't know why this would happen.
+        # In practice, this is a state not properly accounted for
         if jobs["unknown"]:
             LOGGER.warning(f"Found {len(jobs['unknown'])} unknown jobs to investigate.")

@@ -265,6 +267,9 @@ def check_complete(self):
         self.watcher.save(self.save_dir)

         self.save_times()
+
+        # For extra files to write
+        time.sleep(5)
         sys.exit(0)

     @property
@@ -313,19 +318,23 @@ def new_jobs(self):
         # submit_n negative would be OK, a 0-> negative range is empty
         submit_n = max(submit_n, 0)

-        LOGGER.info(f"\n> 🌀 Starting step {step['name']}")
-        LOGGER.info("> Workflow needs")
-        LOGGER.info(f"  > total completions {self.workflow.completions_needed} ")
-        LOGGER.info(f"  > max nodes allowed use {self.workflow.max_size}\n")
-        LOGGER.info("> Current state")
-        LOGGER.info(f"  > nodes / step {nodes_needed} ")
-        LOGGER.info(f"  > jobs needed {jobs_needed} ")
-        LOGGER.info(f"  > nodes allowed {nodes_allowed} ")
-        LOGGER.info(f"  > jobs allowed {jobs_allowed}\n")
-        LOGGER.info("> Workflow progress")
-        LOGGER.info(f"  > Completions {completions}")
-        LOGGER.info(f"  > In progress {active_jobs}")
-        LOGGER.info(f"  > New job sequences submit {submit_n} ")
+        logfn = LOGGER.info
+        if self.quiet:
+            logfn = LOGGER.debug
+
+        logfn(f"\n> 🌀 Starting step {step['name']}")
+        logfn("> Workflow needs")
+        logfn(f"  > total completions {self.workflow.completions_needed} ")
+        logfn(f"  > max nodes allowed use {self.workflow.max_size}\n")
+        logfn("> Current state")
+        logfn(f"  > nodes / step {nodes_needed} ")
+        logfn(f"  > jobs needed {jobs_needed} ")
+        logfn(f"  > nodes allowed {nodes_allowed} ")
+        logfn(f"  > jobs allowed {jobs_allowed}\n")
+        logfn("> Workflow progress")
+        logfn(f"  > Completions {completions}")
+        logfn(f"  > In progress {active_jobs}")
+        logfn(f"  > New job sequences submit {submit_n} ")

         # If submit is > than completions needed, we don't need that many
         # TODO we would also downscale the cluster here
@@ -416,7 +425,6 @@ def watch(self):
         Watch is an event driven means to watch for changes and update job states
         accordingly.
         """
-        # TODO we should have some kind of check that does not rely on an event
        for job in self.tracker.stream_events():

            # Not a job associated with the workflow, or is ignored
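A minimal standalone sketch of the logging indirection above: the log function is chosen once, so --quiet demotes the whole progress report from INFO to DEBUG without touching each line (illustrative only; report_progress is a made-up name):

import logging

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger("manager")

def report_progress(completions, quiet=False):
    # Choose the sink once; every progress line goes through it.
    logfn = LOGGER.debug if quiet else LOGGER.info
    logfn("> Workflow progress")
    logfn(f"  > Completions {completions}")

report_progress(2)              # visible at INFO
report_progress(2, quiet=True)  # hidden unless DEBUG is enabled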

python/state_machine_operator/tracker/kubernetes/job.py

Lines changed: 4 additions & 3 deletions
@@ -40,15 +40,16 @@ def is_completed(self):

     def is_failed(self):
         """
-        Determine if a job is failed
+        Determine if a job is failed.
         """
-        return self.job.status.failed == 1
+        return not self.is_succeeded()

     def is_succeeded(self):
         """
         Determine if a job has succeeded
+        We need to have a completion time and no failed indices.
         """
-        return self.job.status.succeeded == 1
+        return self.is_completed and not self.job.status.failed


 def get_namespace():
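A sketch of the new success condition (again assuming the kubernetes Python client; illustrative, not the module's code): per the added docstring, a job counts as succeeded when it has a completion time and no failed indices.

import datetime
from kubernetes import client

done = client.V1JobStatus(
    completion_time=datetime.datetime.now(datetime.timezone.utc),
    succeeded=4,
    failed=None,
)

# Completion time present and no failed indices -> succeeded.
print(done.completion_time is not None and not done.failed)  # True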

python/state_machine_operator/tracker/kubernetes/state.py

Lines changed: 14 additions & 4 deletions
@@ -58,23 +58,33 @@ def list_jobs_by_status(label_name="app", label_value=None):
     states = {"success": [], "failed": [], "running": [], "queued": [], "unknown": []}

     for job in jobs:
+
+        # These are *counts* of job indices, not boolean 0/1
+        succeeded = job.status.succeeded
+        failed = job.status.failed
+        active = job.status.active
+        not_active = active in [0, None]
+
+        # This is a completion time for the job
+        completion_time = job.status.completion_time
+
         # Success means we finished with succeeded condition
-        if job.status.succeeded == 1 and job.status.completion_time is not None:
+        if succeeded is not None and succeeded > 0 and completion_time is not None:
             states["success"].append(Job(job))
             continue

         # Failure means we finished with failed condition
-        if job.status.failed == 1:
+        if failed is not None and failed > 0:
             states["failed"].append(Job(job))
             continue

         # Not active, and not finished is queued
-        if not job.status.active and not job.status.completion_time:
+        if not_active and not completion_time:
             states["queued"].append(Job(job))
             continue

         # Active, and not finished is running
-        if job.status.active == 1 and not job.status.completion_time:
+        if active and not completion_time:
             states["running"].append(Job(job))
             continue

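The same four-way classification as a self-contained function over plain values (a sketch for readability, not the module's API):

def classify(succeeded, failed, active, completion_time):
    """Map Job status counts to one of the tracker's states."""
    if succeeded is not None and succeeded > 0 and completion_time is not None:
        return "success"
    if failed is not None and failed > 0:
        return "failed"
    if active in [0, None] and not completion_time:
        return "queued"
    if active and not completion_time:
        return "running"
    return "unknown"

print(classify(4, None, None, "2025-01-01T00:00:00Z"))  # success
print(classify(None, 2, 2, None))                       # failed
print(classify(None, None, None, None))                 # queued
print(classify(None, None, 4, None))                    # running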

python/state_machine_operator/tracker/kubernetes/tracker.py

Lines changed: 4 additions & 0 deletions
@@ -349,6 +349,7 @@ def save_log(self, job=None):

         # We might have one pod, but can't assume
         for i, pod in enumerate(pods):
+            print(f"Saving log for {pod.metadata.name}")
             try:
                 logs = api.read_namespaced_pod_log(
                     name=pod.metadata.name,
@@ -363,7 +364,10 @@ def save_log(self, job=None):
                 )
                 # Don't write twice
                 if not os.path.exists(log_file):
+                    print(f"Saving log file {log_file}")
                     utils.write_file(logs, log_file)
+                else:
+                    print(f"Log file {log_file} already exists")

             except client.exceptions.ApiException as e:
                 print(f"Error getting logs: {e}")
