Skip to content

Commit f928de5

Browse files
committed
Updates from testing: make fewer assumptions about output file locations, minor cleanup of results dict.
Signed-off-by: rlratzel <[email protected]>
1 parent c688b05 commit f928de5

File tree

4 files changed: +13 additions, −16 deletions

benchmarking/run.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def run_entry( # noqa: PLR0915
109109
(session_entry_path / d).absolute() for d in ["scratch", "ray_cluster", "logs", "benchmark_results"]
110110
]
111111

112-
cmd = entry.get_command_to_run(session_entry_path, dataset_resolver)
112+
cmd = entry.get_command_to_run(session_entry_path, benchmark_results_path, dataset_resolver)
113113
run_id = result.get("run_id", f"{entry.name}-{int(time.time())}")
114114

115115
try:
@@ -141,7 +141,7 @@ def run_entry( # noqa: PLR0915
141141
logger.warning(f"\t\t⏰ Timed out after {entry.timeout_s}s")
142142
logger.info(f"\t\tLogs found in {logs_path}")
143143

144-
run_data = {
144+
result.update({
145145
"cmd": cmd,
146146
"started_at": started_at,
147147
"ended_at": time.time(),
@@ -151,11 +151,10 @@ def run_entry( # noqa: PLR0915
151151
"timed_out": completed["timed_out"],
152152
"logs_dir": logs_path,
153153
"success": success,
154-
}
154+
})
155155
ray_data = {}
156156
script_persisted_data = get_entry_script_persisted_data(benchmark_results_path)
157157
result.update({
158-
"run_data": run_data,
159158
"ray_data": ray_data,
160159
"script_persisted_data": script_persisted_data,
161160
})

benchmarking/runner/matrix.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@ class MatrixEntry:
2121
# If set, overrides the session-level delete_scratch setting for this entry
2222
delete_scratch: bool | None = None
2323

24-
def get_command_to_run(self, session_entry_path: Path, resolver: DatasetResolver) -> str:
24+
def get_command_to_run(self, session_entry_path: Path, benchmark_results_path: Path, resolver: DatasetResolver) -> str:
2525
if self.script:
2626
script_path = self.script_base_dir / self.script
2727
# FIXME: should --benchmark-results-path always be passed?
28-
cmd = f"python {script_path} {self.args or ''} --benchmark-results-path" + " {session_entry_dir}/benchmark_results"
28+
cmd = f"python {script_path} {self.args or ''} --benchmark-results-path={benchmark_results_path}"
2929

3030
cmd = self.substitute_datasets_in_cmd(cmd, resolver)
3131
cmd = self.substitute_template_placeholders(cmd, session_entry_path)

benchmarking/runner/sinks/slack_sink.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def _post_style1(self) -> None:
150150
data = [
151151
("name", result["name"]),
152152
("success", result["success"]),
153-
("runtime", f"{result['run_data']['exec_time_s']:.2f} s"),
153+
("runtime", f"{result['exec_time_s']:.2f} s"),
154154
]
155155
left, right = zip(*data)
156156
right = [str(val) for val in right]
@@ -207,7 +207,7 @@ def _post_style2(self) -> None:
207207
data = [
208208
("name", result["name"]),
209209
("success", result["success"]),
210-
("runtime", f"{result['run_data']['exec_time_s']:.2f} s"),
210+
("runtime", f"{result['exec_time_s']:.2f} s"),
211211
]
212212
for (var, val) in data:
213213
row = [

benchmarking/scripts/test_benchmark.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,8 @@ def run_demo_benchmark( # noqa: PLR0913
3333
from nemo_curator.backends.experimental.ray_data import RayDataExecutor
3434

3535
executor = RayDataExecutor()
36-
if use_ray_data_settings:
37-
from ray.data import DataContext
38-
39-
DataContext.get_current().target_max_block_size = 1
36+
#from ray.data import DataContext
37+
#DataContext.get_current().target_max_block_size = 1
4038

4139
elif executor_name == "xenna":
4240
from nemo_curator.backends.xenna import XennaExecutor
@@ -55,15 +53,15 @@ def run_demo_benchmark( # noqa: PLR0913
5553
try:
5654
time.sleep(10)
5755
output_tasks = []
56+
run_time_taken = time.perf_counter() - run_start_time
57+
num_removed = 0
5858
logger.success(f"Benchmark completed in {run_time_taken:.2f}s")
5959
success = True
6060
except Exception as e: # noqa: BLE001
6161
logger.error(f"Benchmark failed: {e}")
6262
output_tasks = []
6363
success = False
64-
finally:
65-
run_time_taken = time.perf_counter() - run_start_time
66-
num_removed = 0
64+
6765
return {
6866
"params": {
6967
"executor": executor_name,
@@ -125,7 +123,7 @@ def main() -> int:
125123
"tasks": [],
126124
}
127125
finally:
128-
write_results(results, args.output_path)
126+
write_results(results, args.benchmark_results_path)
129127

130128
# Return proper exit code based on success
131129
return 0 if results["metrics"]["is_success"] else 1

0 commit comments

Comments (0)