src/scheduler: schedule a job retry

Jeny Sadadia · Jeny Sadadia · commit 752d67d587bd · 2025-07-30T15:08:00.000+05:30
Implement threading to listen to two channels, `node` and `retry`,
simultaneously.
Schedule a job retry by passing `retry_counter` and `platform_filter`
while creating a job node.

Signed-off-by: Jeny Sadadia <jeny.sadadia@collabora.com>
diff --git a/src/scheduler.py b/src/scheduler.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LGPL-2.1-or-later
 #
-# Copyright (C) 2021, 2022, 2023 Collabora Limited
+# Copyright (C) 2021-2025 Collabora Limited
 # Author: Guillaume Tucker <guillaume.tucker@collabora.com>
 # Author: Jeny Sadadia <jeny.sadadia@collabora.com>
 
@@ -105,11 +105,16 @@ def _cleanup_paths(self):
         # ToDo: if stat != 0 then report error to API?
 
     def _setup(self, args):
-        return self._api.subscribe('node')
-
-    def _stop(self, sub_id):
-        if sub_id:
-            self._api_helper.unsubscribe_filters(sub_id)
+        node_sub_id = self._api.subscribe('node')
+        self.log.debug(f"Node channel sub id: {node_sub_id}")
+        retry_sub_id = self._api.subscribe('retry')
+        self.log.debug(f"Retry channel sub id: {retry_sub_id}")
+        return [node_sub_id, retry_sub_id]
+
+    def _stop(self, sub_ids):
+        for sub_id in sub_ids:
+            if sub_id:
+                self._api_helper.unsubscribe_filters(sub_id)
         self._cleanup_paths()
 
     def backup_cleanup(self):
@@ -144,11 +149,11 @@ def backup_job(self, filename, nodeid):
         except Exception as e:
             self.log.error(f"Failed to backup {filename} to {new_filename}: {e}")
 
-    def _run_job(self, job_config, runtime, platform, input_node):
+    def _run_job(self, job_config, runtime, platform, input_node, retry_counter):
         try:
             node = self._api_helper.create_job_node(job_config,
                                                     input_node,
-                                                    runtime, platform)
+                                                    runtime, platform, retry_counter)
         except KeyError as e:
             self.log.error(' '.join([
                 input_node['id'],
@@ -162,6 +167,7 @@ def _run_job(self, job_config, runtime, platform, input_node):
 
         if not node:
             return
+        self.log.debug(f"Job node created: {node['id']}. Parent: f{node['parent']}")
         # Most of the time, the artifacts we need originate from the parent
         # node. Import those into the current node, working on a copy so the
         # original node doesn't get "polluted" with useless artifacts when we
@@ -371,7 +377,17 @@ def _verify_architecture_filter(self, job, node):
             return False
         return True
 
-    def _run(self, sub_id):
+    def _run(self, sub_ids):
+        threads = []
+        for sub_id in sub_ids:
+            thread = threading.Thread(target=self._run_scheduler, args=(sub_id,))
+            threads.append(thread)
+            thread.start()
+
+        for thread in threads:
+            thread.join()
+
+    def _run_scheduler(self, sub_id):
         self.log.info("Listening for available checkout events")
         self.log.info("Press Ctrl-C to stop.")
         subscribe_retries = 0
@@ -381,33 +397,38 @@ def _run(self, sub_id):
             event = None
             try:
                 event = self._api_helper.receive_event_data(sub_id, block=False)
+                if not event:
+                    # If we received a keep-alive event, just continue
+                    continue
             except Exception as e:
                 self.log.error(f"Error receiving event: {e}, re-subscribing in 10 seconds")
-                time.sleep(10)
-                sub_id = self._api.subscribe('node')
-                subscribe_retries += 1
-                if subscribe_retries > 3:
-                    self.log.error("Failed to re-subscribe to node events")
-                    return False
-                continue
-            if not event:
-                # If we received a keep-alive event, just continue
+                # time.sleep(10)
+                # sub_id = self._api.subscribe('node')
+                # subscribe_retries += 1
+                # if subscribe_retries > 3:
+                #     self.log.error("Failed to re-subscribe to node events")
+                #     return False
                 continue
-            subscribe_retries = 0
+            # subscribe_retries = 0
+            self.log.debug(f"Event received: {sub_id}:{event['id']}:{event.get('debug')}:{event.get('retry_counter')}")
             for job, runtime, platform, rules in self._sched.get_schedule(event):
                 input_node = self._api.node.get(event['id'])
                 jobfilter = event.get('jobfilter')
                 # Add to node data the jobfilter if it exists in event
                 if jobfilter and isinstance(jobfilter, list):
                     input_node['jobfilter'] = jobfilter
+                platform_filter = event.get('platform_filter')
+                if platform_filter and isinstance(platform_filter, list):
+                    input_node['platform_filter'] = platform_filter
                 # we cannot use rules, as we need to have info about job too
                 if job.params.get('frequency', None):
                     if not self._verify_frequency(job, input_node, platform):
                         continue
                 if not self._verify_architecture_filter(job, input_node):
                     continue
                 if self._api_helper.should_create_node(rules, input_node):
-                    self._run_job(job, runtime, platform, input_node)
+                    retry_counter = event.get('retry_counter', 0)
+                    self._run_job(job, runtime, platform, input_node, retry_counter)
 
         return True