Refactor cell tracking using hook from ExecutePreprocessor

agupta01 · agupta01 · commit 48f9d3f296f9 · 2025-07-22T21:01:32.000-04:00
diff --git a/jupyter_scheduler/executors.py b/jupyter_scheduler/executors.py
@@ -11,38 +11,12 @@
 import nbformat
 from nbconvert.preprocessors import CellExecutionError, ExecutePreprocessor
 
-from jupyter_scheduler.models import DescribeJob, JobFeature, Status, UpdateJob
+from jupyter_scheduler.models import DescribeJob, JobFeature, Status
 from jupyter_scheduler.orm import Job, create_session
 from jupyter_scheduler.parameterize import add_parameters
 from jupyter_scheduler.utils import get_utc_timestamp
 
 
-class TrackingExecutePreprocessor(ExecutePreprocessor):
-    """Custom ExecutePreprocessor that tracks completed cells and updates the database"""
-    
-    def __init__(self, db_session, job_id, **kwargs):
-        super().__init__(**kwargs)
-        self.db_session = db_session
-        self.job_id = job_id
-    
-    def preprocess_cell(self, cell, resources, index):
-        """
-        Override to track completed cells in the database.
-        Calls the superclass implementation and then updates the database.
-        """
-        # Call the superclass implementation
-        cell, resources = super().preprocess_cell(cell, resources, index)
-        
-        # Update the database with the current count of completed cells
-        with self.db_session() as session:
-            session.query(Job).filter(Job.job_id == self.job_id).update(
-                {"completed_cells": self.code_cells_executed}
-            )
-            session.commit()
-        
-        return cell, resources
-
-
 class ExecutionManager(ABC):
     """Base execution manager.
     Clients are expected to override this class
@@ -158,14 +132,14 @@ def execute(self):
             nb = add_parameters(nb, job.parameters)
 
         staging_dir = os.path.dirname(self.staging_paths["input"])
-        ep = TrackingExecutePreprocessor(
-            db_session=self.db_session,
-            job_id=self.job_id,
-            kernel_name=nb.metadata.kernelspec["name"], 
-            store_widget_state=True, 
-            cwd=staging_dir
+
+        ep = ExecutePreprocessor(
+            kernel_name=nb.metadata.kernelspec["name"], store_widget_state=True, cwd=staging_dir
         )
 
+        if self.supported_features().get(JobFeature.track_cell_execution, False):
+            ep.on_cell_executed = self.__update_completed_cells_hook(ep)
+
         try:
             ep.preprocess(nb, {"metadata": {"path": staging_dir}})
         except CellExecutionError as e:
@@ -174,6 +148,16 @@ def execute(self):
             self.add_side_effects_files(staging_dir)
             self.create_output_files(job, nb)
 
+    def __update_completed_cells_hook(self, ep: ExecutePreprocessor):
+        """Return a callback that, each time it is invoked, writes ``ep.code_cells_executed`` to ``completed_cells`` on the Job matching ``self.job_id`` and commits; ``execute`` assigns it to ``ep.on_cell_executed``."""
+        def update_completed_cells(cell, cell_index, execute_reply):  # arguments are unused here; presumably the keywords the on_cell_executed hook passes — confirm against nbclient
+            with self.db_session() as session:  # a new session is opened for every invocation
+                session.query(Job).filter(Job.job_id == self.job_id).update(
+                    {"completed_cells": ep.code_cells_executed}  # counter is read from ep at call time, not when the hook is built
+                )
+                session.commit()
+        return update_completed_cells  # NOTE(review): whether the hook also fires for failing cells depends on nbclient — confirm
+
     def add_side_effects_files(self, staging_dir: str):
         """Scan for side effect files potentially created after input file execution and update the job's packaged_files with these files"""
         input_notebook = os.path.relpath(self.staging_paths["input"])
@@ -203,6 +187,7 @@ def create_output_files(self, job: DescribeJob, notebook_node):
             with fsspec.open(self.staging_paths[output_format], "w", encoding="utf-8") as f:
                 f.write(output)
 
+    @classmethod
     def supported_features(cls) -> Dict[JobFeature, bool]:
         return {
             JobFeature.job_name: True,
@@ -218,8 +203,10 @@ def supported_features(cls) -> Dict[JobFeature, bool]:
             JobFeature.output_filename_template: False,
             JobFeature.stop_job: True,
             JobFeature.delete_job: True,
+            JobFeature.track_cell_execution: True,
         }
 
+    @classmethod
     def validate(cls, input_path: str) -> bool:
         with open(input_path, encoding="utf-8") as f:
             nb = nbformat.read(f, as_version=4)
diff --git a/jupyter_scheduler/models.py b/jupyter_scheduler/models.py
@@ -297,3 +297,4 @@ class JobFeature(str, Enum):
     output_filename_template = "output_filename_template"
     stop_job = "stop_job"
     delete_job = "delete_job"
+    track_cell_execution = "track_cell_execution"
diff --git a/jupyter_scheduler/scheduler.py b/jupyter_scheduler/scheduler.py
@@ -442,7 +442,7 @@ def create_job(self, model: CreateJob) -> str:
             raise InputUriError(model.input_uri)
 
         input_path = os.path.join(self.root_dir, model.input_uri)
-        if not self.execution_manager_class.validate(self.execution_manager_class, input_path):
+        if not self.execution_manager_class.validate(input_path):
             raise SchedulerError(
                 """There is no kernel associated with the notebook. Please open
                     the notebook, select a kernel, and re-submit the job to execute.
diff --git a/jupyter_scheduler/tests/test_execution_manager.py b/jupyter_scheduler/tests/test_execution_manager.py