
Commit aa9b2f6

Fixes linter errors.
Signed-off-by: rlratzel <[email protected]>
1 parent 13eaca6 · commit aa9b2f6

16 files changed (+189, -191 lines)


benchmarking/Dockerfile

Lines changed: 4 additions & 3 deletions
@@ -48,7 +48,7 @@ ENV MAMBA_ROOT_PREFIX=/opt/micromamba
 ENV PATH=$MAMBA_ROOT_PREFIX/bin:$PATH
 RUN curl -Ls https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \
     micromamba shell init -s bash -r $MAMBA_ROOT_PREFIX
-
+
 # Install uv
 ENV UV_VERSION="0.8.22"
 RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
@@ -93,7 +93,7 @@ RUN git clone https://github.com/OpenGVLab/InternVideo.git && \
 
 
 ########################################################################
-# curator_benchmark image -
+# curator_benchmark image -
 #
 # use cases:
 # * Start a container standalone to run all Curator benchmarks. Datasets are downloaded automatically and reside only in the container.
@@ -118,6 +118,7 @@ FROM curator_system_base AS curator_benchmarking
 COPY --from=curator_setup_deps /opt /opt
 
 # Install Curator, which includes benchmarking tools
+# Update pyproject.toml to get the latest RAPIDS libs
 COPY . /opt/Curator
 RUN cd /opt/Curator \
     && uv sync --link-mode copy --locked --extra all --all-groups \
@@ -133,7 +134,7 @@ ARG NVIDIA_BUILD_REF
 LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
 
 # Install deps for specific benchmark scripts.
-# FIXME: look into a way that script authors can install their own deps so this does not need to be updated for each new script dep.
+# TODO: look into a way that script authors can install their own deps so this does not need to be updated for each new script dep.
 RUN apt-get install -y --no-install-recommends \
     wget
 

benchmarking/README.md

Lines changed: 15 additions & 16 deletions
@@ -322,9 +322,9 @@ def main():
     # Add your custom arguments
     parser.add_argument("--input", type=str)
     parser.add_argument("--iterations", type=int, default=100)
-
+
     args = parser.parse_args()
-
+
     # Your benchmark logic here
     run_benchmark(args)
 
@@ -407,28 +407,28 @@ from nemo_curator.tasks.utils import TaskPerfUtils
 def run_benchmark(args):
     """Main benchmark logic."""
     start_time = time.time()
-
+
     # Your benchmark code here
     with Task("my_operation", TaskPerfUtils()):
         result = perform_operation(args.input)
-
+
     execution_time = time.time() - start_time
-
+
     # Write required output files
     params = {
         "input": str(args.input),
         "parameter1": args.param1,
     }
     with open(args.benchmark_results_path / "params.json", "w") as f:
         json.dump(params, f, indent=2)
-
+
     metrics = {
         "execution_time_s": execution_time,
         "items_processed": len(result),
     }
     with open(args.benchmark_results_path / "metrics.json", "w") as f:
         json.dump(metrics, f, indent=2)
-
+
     tasks = Task.get_all_tasks()
     with open(args.benchmark_results_path / "tasks.pkl", "wb") as f:
         pickle.dump(tasks, f)
@@ -439,7 +439,7 @@ def main():
     parser.add_argument("--benchmark-results-path", type=Path, required=True)
     parser.add_argument("--input", type=str, required=True)
     parser.add_argument("--param1", type=str, default="default")
-
+
     args = parser.parse_args()
     run_benchmark(args)
 
@@ -521,33 +521,33 @@ class MyCustomSink(Sink):
         self.config = config
         self.enabled = config.get("enabled", True)
         self.api_endpoint = config.get("api_endpoint")
-
+
         # Initialize any resources
         if not self.api_endpoint:
             raise ValueError("MyCustomSink: api_endpoint is required")
-
+
     def initialize(self, session_name: str, env_data: dict[str, Any]) -> None:
         """Called at session start."""
         self.session_name = session_name
         self.env_data = env_data
-
+
         if self.enabled:
             logger.info(f"MyCustomSink: Starting session {session_name}")
             # Perform initialization (e.g., create remote session)
-
+
     def process_result(self, result: dict[str, Any]) -> None:
         """Called after each entry completes."""
         if self.enabled:
             logger.info(f"MyCustomSink: Processing {result['name']}")
             # Send result to your API, database, etc.
             self._send_to_api(result)
-
+
     def finalize(self) -> None:
         """Called at session end."""
         if self.enabled:
             logger.info("MyCustomSink: Finalizing session")
             # Perform cleanup, send summary, etc.
-
+
     def _send_to_api(self, data: dict) -> None:
         """Helper method for API calls."""
         # Your implementation
@@ -895,7 +895,7 @@ entries:
   - name: benchmark_v1
     script: my_benchmark.py
     args: --input {dataset:sample_data,parquet} --algorithm v1
-
+
   - name: benchmark_v2
     script: my_benchmark.py
     args: --input {dataset:sample_data,parquet} --algorithm v2
@@ -1009,4 +1009,3 @@ benchmarking/
 Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
 Licensed under the Apache License, Version 2.0. See the main repository LICENSE file for details.
-

benchmarking/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1 +0,0 @@
-

benchmarking/commands.sh

Lines changed: 1 addition & 1 deletion
@@ -27,4 +27,4 @@ python ./scripts/common_crawl_benchmark.py \
     --url_limit 10 \
     --add_filename_column \
     --executor ray_data \
-    --ray_data_cast_as_actor
+    --ray_data_cast_as_actor

benchmarking/run.py

Lines changed: 49 additions & 67 deletions
@@ -13,66 +13,40 @@
 # limitations under the License.
 
 import argparse
-from pathlib import Path
-import yaml
-import os
-import time
-import sys
-from typing import Any
 import json
-import traceback
-from statistics import mean, stdev
+import os
 import pickle
 import shutil
+import sys
+import time
 import traceback
+from pathlib import Path
+from typing import Any
 
+import yaml
 from loguru import logger
 
-from nemo_curator.tasks import Task
 from nemo_curator.tasks.utils import TaskPerfUtils
+from nemo_curator.utils.file_utils import create_or_overwrite_dir
 
-# FIXME: How do we want to package this tool? Perhaps a package extra for
+# TODO: How do we want to package this tool? Perhaps a package extra for
 # nemo-curator, i.e. nemo-curator[benchmarking]?
 # For now, add this directory to PYTHONPATH to import the runner modules
 sys.path.insert(0, Path(__file__).parent)
-from runner.matrix import MatrixConfig, MatrixEntry
 from runner.datasets import DatasetResolver
-from runner.utils import get_obj_for_json
-from runner.process import run_command_with_timeout
 from runner.env_capture import dump_env
+from runner.matrix import MatrixConfig, MatrixEntry
+from runner.process import run_command_with_timeout
+from runner.utils import get_obj_for_json
 
 
 def ensure_dir(dir_path: Path) -> None:
     """Ensure dir_path and parents exists, creating them if necessary."""
     dir_path.mkdir(parents=True, exist_ok=True)
 
 
-def create_or_overwrite_dir(dir_path: Path) -> None:
-    """Create directory, removing it if it exists."""
-    if dir_path.exists():
-        shutil.rmtree(dir_path, ignore_errors=True)
-    dir_path.mkdir(parents=True, exist_ok=True)
-
-
-def aggregate_task_metrics(tasks: list[Task], prefix: str | None = None) -> dict[str, Any]:
-    """Aggregate task metrics by computing mean/std/sum."""
-    metrics = {}
-    tasks_metrics = TaskPerfUtils.collect_stage_metrics(tasks)
-    # For each of the metric compute mean/std/sum and flatten the dict
-    for stage_name, stage_data in tasks_metrics.items():
-        for metric_name, values in stage_data.items():
-            for agg_name, agg_func in [("sum", sum), ("mean", mean), ("std", stdev)]:
-                stage_key = stage_name if prefix is None else f"{prefix}_{stage_name}"
-                if len(values) > 0:
-                    metrics[f"{stage_key}_{metric_name}_{agg_name}"] = float(agg_func(values))
-                else:
-                    metrics[f"{stage_key}_{metric_name}_{agg_name}"] = 0.0
-    return metrics
-
-
 def get_entry_script_persisted_data(benchmark_results_path: Path) -> dict[str, Any]:
-    """ Read the files that are expected to be generated by the individual benchmark scripts.
-    """
+    """Read the files that are expected to be generated by the individual benchmark scripts."""
     params_json = benchmark_results_path / "params.json"
     if not params_json.exists():
         logger.warning(f"Params JSON file not found at {params_json}")
@@ -97,22 +71,23 @@ def get_entry_script_persisted_data(benchmark_results_path: Path) -> dict[str, A
     with open(tasks_pkl, "rb") as f:
         script_tasks = pickle.load(f)  # noqa: S301
     if isinstance(script_tasks, list):
-        script_metrics.update(aggregate_task_metrics(script_tasks, prefix="task"))
+        script_metrics.update(TaskPerfUtils.aggregate_task_metrics(script_tasks, prefix="task"))
     elif isinstance(script_tasks, dict):
         for pipeline_name, pipeline_tasks in script_tasks.items():
-            script_metrics.update(aggregate_task_metrics(pipeline_tasks, prefix=pipeline_name.lower()))
+            script_metrics.update(
+                TaskPerfUtils.aggregate_task_metrics(pipeline_tasks, prefix=pipeline_name.lower())
+            )
 
     return {"params": script_params, "metrics": script_metrics}
 
 
-def run_entry(  # noqa: PLR0915
+def run_entry(
     entry: MatrixEntry,
     dataset_resolver: DatasetResolver,
     session_path: Path,
     result: dict[str, Any],
 ) -> tuple[dict[str, Any], bool, dict[str, Any]]:
-
-    started_at = time.time()
+    started_at = time.time()
     session_entry_path = session_path / entry.name
 
     # scratch_path : This is the directory user can use to store scratch data; it'll be cleaned up after the entry is done
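
The two call sites above now delegate to TaskPerfUtils.aggregate_task_metrics, replacing the helper deleted in the first hunk of this file. For reference, a runnable copy of that deleted aggregation logic (the collect_stage_metrics call and the imports come from the removed code; the single-sample stdev guard is an addition here, and the library's own implementation is assumed, not verified, to behave the same way):

from statistics import mean, stdev
from typing import Any

from nemo_curator.tasks import Task
from nemo_curator.tasks.utils import TaskPerfUtils


def aggregate_task_metrics(tasks: list[Task], prefix: str | None = None) -> dict[str, Any]:
    """Flatten per-stage metric lists into {stage}_{metric}_{sum|mean|std} floats."""
    metrics: dict[str, Any] = {}
    tasks_metrics = TaskPerfUtils.collect_stage_metrics(tasks)
    for stage_name, stage_data in tasks_metrics.items():
        stage_key = stage_name if prefix is None else f"{prefix}_{stage_name}"
        for metric_name, values in stage_data.items():
            for agg_name, agg_func in [("sum", sum), ("mean", mean), ("std", stdev)]:
                key = f"{stage_key}_{metric_name}_{agg_name}"
                if agg_name == "std" and len(values) < 2:
                    # stdev() needs at least two samples; the deleted helper only guarded against empty lists.
                    metrics[key] = 0.0
                else:
                    metrics[key] = float(agg_func(values)) if values else 0.0
    return metrics
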
@@ -155,23 +130,27 @@ def run_entry( # noqa: PLR0915
         logger.warning(f"\t\t⏰ Timed out after {entry.timeout_s}s")
     logger.info(f"\t\tLogs found in {logs_path}")
 
-    result.update({
-        "cmd": cmd,
-        "started_at": started_at,
-        "ended_at": time.time(),
-        "exec_started_at": started_exec,
-        "exec_time_s": ended_exec - started_exec,
-        "exit_code": completed["returncode"],
-        "timed_out": completed["timed_out"],
-        "logs_dir": logs_path,
-        "success": success,
-    })
+    result.update(
+        {
+            "cmd": cmd,
+            "started_at": started_at,
+            "ended_at": time.time(),
+            "exec_started_at": started_exec,
+            "exec_time_s": ended_exec - started_exec,
+            "exit_code": completed["returncode"],
+            "timed_out": completed["timed_out"],
+            "logs_dir": logs_path,
+            "success": success,
+        }
+    )
     ray_data = {}
     script_persisted_data = get_entry_script_persisted_data(benchmark_results_path)
-    result.update({
-        "ray_data": ray_data,
-        "script_persisted_data": script_persisted_data,
-    })
+    result.update(
+        {
+            "ray_data": ray_data,
+            "script_persisted_data": script_persisted_data,
+        }
+    )
     Path(session_entry_path / "results.json").write_text(json.dumps(get_obj_for_json(result)))
 
     return success
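
The next hunk switches main()'s config reading to a context-managed open; the surrounding merge loop still folds every YAML document from every --config file into one flat dict, with later keys overwriting earlier ones. A minimal, self-contained illustration of that merge behavior (inline YAML text instead of files; the keys are borrowed from examples elsewhere in this change):

import yaml

# Two YAML documents in one stream, like the matrix config files run.py consumes.
config_text = """\
datasets:
  - name: sample_data
    formats:
      - type: parquet
        path: /data/sample_data/parquet
---
entries:
  - name: benchmark_v1
    script: my_benchmark.py
"""

config_dict: dict = {}
for doc in yaml.full_load_all(config_text):
    # Later documents overwrite earlier keys, matching the loop in main().
    config_dict.update(doc)

print(sorted(config_dict))  # -> ['datasets', 'entries']
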
@@ -200,10 +179,11 @@ def main() -> None:
     # and use by passing individual components the keys they need
     config_dict = {}
     for yml_file in args.config:
-        config_dicts = yaml.full_load_all(open(yml_file))
+        with open(yml_file) as f:
+            config_dicts = yaml.full_load_all(f)
         for d in config_dicts:
             config_dict.update(d)
-
+
     config = MatrixConfig.create_from_dict(config_dict)
     resolver = DatasetResolver.create_from_dicts(config_dict["datasets"])
 
@@ -216,7 +196,7 @@
     session_overall_success = True
     logger.info(f"Started session {session_name}...")
     env_data = dump_env(session_path)
-
+
     for sink in config.sinks:
         sink.initialize(session_name, env_data)
 
@@ -242,12 +222,14 @@
             error_traceback = traceback.format_exc()
             logger.error(f"\t\t❌ Entry failed with exception: {e}")
             logger.debug(f"Full traceback:\n{error_traceback}")
-            result.update({
-                "error": str(e),
-                "traceback": error_traceback,
-                "success": run_success,
-            })
-
+            result.update(
+                {
+                    "error": str(e),
+                    "traceback": error_traceback,
+                    "success": run_success,
+                }
+            )
+
         finally:
             session_overall_success &= run_success
             for sink in config.sinks:

benchmarking/runner/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1 +0,0 @@
-

benchmarking/runner/datasets.py

Lines changed: 6 additions & 4 deletions
@@ -36,20 +36,22 @@ def create_from_dicts(cls, data: list[dict]) -> DatasetResolver:
         # Check for duplicate dataset names before proceeding
         names = [d["name"] for d in data]
         if len(names) != len(set(names)):
-            duplicates = set([name for name in names if names.count(name) > 1])
-            raise ValueError(f"Duplicate dataset name(s) found: {', '.join(duplicates)}")
+            duplicates = {name for name in names if names.count(name) > 1}
+            msg = f"Duplicate dataset name(s) found: {', '.join(duplicates)}"
+            raise ValueError(msg)
 
         instance = cls()
         for dataset in data:
             formats = dataset["formats"]
-            assert isinstance(formats, list), "formats must be a list"
+            if not isinstance(formats, list):
+                msg = "formats must be a list"
+                raise TypeError(msg)
             format_map = {}
             for fmt in formats:
                 format_map[fmt["type"]] = fmt["path"]
             instance._map[dataset["name"]] = format_map
         return instance
 
-
     def resolve(self, dataset_name: str, file_format: str) -> str:
         if dataset_name not in self._map:
             msg = f"Unknown dataset: {dataset_name}"
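
The duplicate-name and format-type checks above define how DatasetResolver expects its "datasets" config to look. A usage sketch based only on what this diff shows (the dict shape, create_from_dicts, and resolve); the paths are hypothetical, and running it assumes the benchmarking/ directory is on PYTHONPATH, as run.py arranges:

from runner.datasets import DatasetResolver

# Illustrative config shaped like the "datasets" entries parsed above.
datasets_cfg = [
    {
        "name": "sample_data",
        "formats": [
            {"type": "parquet", "path": "/data/sample_data/parquet"},
            {"type": "jsonl", "path": "/data/sample_data/jsonl"},
        ],
    },
]

resolver = DatasetResolver.create_from_dicts(datasets_cfg)
print(resolver.resolve("sample_data", "parquet"))  # -> /data/sample_data/parquet
# A duplicate "name" or a non-list "formats" now raises ValueError/TypeError instead of asserting.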

benchmarking/runner/env_capture.py

Lines changed: 1 addition & 2 deletions
@@ -21,7 +21,6 @@
 from typing import Any
 
 from loguru import logger
-
 from runner.utils import get_obj_for_json
 
 
@@ -68,4 +67,4 @@ def get_env() -> dict[str, Any]:
         "python_version": platform.python_version(),
         "executable": os.getenv("_"),
         "cuda_visible_devices": os.getenv("CUDA_VISIBLE_DEVICES", ""),
-    }
+    }
