
Commit 7f9b7d8

Merge remote-tracking branch 'origin/main' into dj_agents
2 parents: 2ab9c61 + c07c2d7


43 files changed (+1580, -151 lines)

README.md
Lines changed: 1 addition & 1 deletion

@@ -107,7 +107,7 @@ Besides, our paper is also updated to [v3](https://arxiv.org/abs/2309.02033).
 through the [sandbox laboratory](docs/Sandbox.md), and providing features such as feedback loops and visualization, so that you can better understand and improve your data and models. Many effect-proven datasets and models have been derived from DJ, in scenarios such as pre-training, text-to-video and image-to-text generation.
 ![Data-in-the-loop](https://img.alicdn.com/imgextra/i2/O1CN017U7Zz31Y7XtCJ5GOz_!!6000000003012-0-tps-3640-1567.jpg)
 
-## Doucmentation
+## Documentation
 
 - Tutorial
 - [DJ-Cookbook](docs/tutorial/DJ-Cookbook.md)

configs/data_juicer_recipes/sandbox/auto_prompt_optimization/sandbox_auto_prompt_optimization.yaml
Lines changed: 3 additions & 2 deletions

@@ -10,7 +10,7 @@ resume: true # allo
 # iteration related parameters
 max_iter_num: 5
 iter_targets:
-  - "grader_model_prompt_optimization.grades_evaluation.min_mse <= 0.26"
+  - "grader_model_prompt_optimization.grades_evaluation.min_mse <= 0.2"
 iter_updater:
   select_and_merge_data_pools.merge_single_prompt_data_pools.merged_top_prompt_dataset: grader_model_prompt_optimization.generate_new_prompts.dj_configs.dataset_path
 
@@ -53,11 +53,12 @@ pipelines:
       type: 'api'
       model: 'qwen2.5-32b-instruct'
      max_retry_num: 5
-      build_messages_func: 'build_messages'
+      build_messages_func: 'build_messages_for_math_qa'
       parse_output_func: 'parse_output'
       func_kwargs:
         system_key: "prompt"
         query_key: "question"
+        response_key: "answer"
       # data related
       dataset_path: '<replaced_by_the_input>'
       export_path: './outputs/auto-prompt-optimization-for-grader-model/infer_results/'
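Each iter_targets entry is a threshold expression over a dotted path into the pipeline results. Below is a minimal sketch of how such a string could be evaluated, assuming results live in a nested dict; the parser, the OPS table, and the target_met name are illustrative, not Data-Juicer's actual implementation:

import operator

# Comparison operators; two-char symbols first so "<=" is not split as "<".
OPS = {"<=": operator.le, ">=": operator.ge, "<": operator.lt, ">": operator.gt}

def target_met(target: str, results: dict) -> bool:
    """Evaluate e.g. 'a.b.min_mse <= 0.2' against nested results."""
    for sym, fn in OPS.items():
        if sym in target:
            path, threshold = target.split(sym)
            value = results
            for key in path.strip().split("."):  # walk the dotted path
                value = value[key]
            return fn(float(value), float(threshold))
    raise ValueError(f"unsupported target: {target}")

# With min_mse = 0.18, the tightened 0.2 target would be satisfied:
results = {"grader_model_prompt_optimization": {"grades_evaluation": {"min_mse": 0.18}}}
assert target_met("grader_model_prompt_optimization.grades_evaluation.min_mse <= 0.2", results)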

data_juicer/config/config.py
Lines changed: 3 additions & 1 deletion

@@ -28,6 +28,7 @@
 from data_juicer.utils.constant import RAY_JOB_ENV_VAR
 from data_juicer.utils.logger_utils import setup_logger
 from data_juicer.utils.mm_utils import SpecialTokens
+from data_juicer.utils.ray_utils import is_ray_mode
 
 global_cfg = None
 global_parser = None
@@ -749,7 +750,6 @@ def init_setup_from_cfg(cfg: Namespace, load_configs_only=False):
         "audio_key": cfg.get("audio_key", "audios"),
         "video_key": cfg.get("video_key", "videos"),
         "image_bytes_key": cfg.get("image_bytes_key", "image_bytes"),
-        "num_proc": cfg.get("np", None),
         "turbo": cfg.get("turbo", False),
         "skip_op_error": cfg.get("skip_op_error", True),
         "work_dir": cfg.work_dir,
@@ -758,6 +758,8 @@
         "video_special_token": cfg.get("video_special_token", SpecialTokens.video),
         "eoc_special_token": cfg.get("eoc_special_token", SpecialTokens.eoc),
     }
+    if not is_ray_mode():
+        op_attrs.update({"num_proc": cfg.get("np", None)})
    cfg.process = update_op_attr(cfg.process, op_attrs)
 
     return cfg
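The new guard keeps num_proc out of op_attrs when running on Ray, so the Ray-side concurrency logic (see ray_dataset.py below) takes over. How is_ray_mode() decides is not shown in this commit; a plausible sketch, assuming it keys off the RAY_JOB_ENV_VAR environment variable imported at the top of this file (the real implementation in data_juicer/utils/ray_utils.py may differ):

import os

from data_juicer.utils.constant import RAY_JOB_ENV_VAR

def is_ray_mode() -> bool:
    # Assumption: running under Ray is signalled by an environment
    # variable set when the job is launched in ray/distributed mode.
    return os.environ.get(RAY_JOB_ENV_VAR, "").lower() in ("1", "true")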

data_juicer/core/data/ray_dataset.py
Lines changed: 11 additions & 28 deletions

@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import os
-import sys
 from functools import partial
 from typing import Any, Dict, List, Literal, Optional, Union
 
@@ -16,7 +15,6 @@
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import is_remote_path
 from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.process_utils import calculate_np
 from data_juicer.utils.resource_utils import cuda_device_count
 from data_juicer.utils.webdataset_utils import _custom_default_decoder
 
@@ -148,27 +146,16 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None) -
             return self
         if not isinstance(operators, list):
             operators = [operators]
+
+        from data_juicer.utils.process_utils import calculate_ray_np
+
+        calculate_ray_np(operators)
+
         for op in operators:
             self._run_single_op(op)
         return self
 
     def _run_single_op(self, op):
-        # TODO: optimize auto proc
-        auto_parallel = False
-        if op.num_proc:
-            op_proc = op.num_proc
-        else:
-            auto_parallel = True
-            op_proc = sys.maxsize
-        auto_op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, op.use_cuda(), op.gpu_required)
-        op_proc = min(op_proc, auto_op_proc)
-
-        # use ray default parallelism in cpu mode if op.num_proc is not specified
-        if op.use_cuda() or not auto_parallel:
-            logger.info(f"Op [{op._name}] running with number of procs:{op_proc}")
-
-        num_gpus = op.gpu_required if op.gpu_required else get_num_gpus(op, op_proc)
-
         if op._name in TAGGING_OPS.modules and Fields.meta not in self.data.columns():
 
             def process_batch_arrow(table: pyarrow.Table):
@@ -193,8 +180,8 @@ def process_batch_arrow(table: pyarrow.Table):
                     fn_constructor_kwargs=op_kwargs,
                     batch_size=batch_size,
                     num_cpus=op.cpu_required,
-                    num_gpus=num_gpus,
-                    concurrency=op_proc,
+                    num_gpus=op.gpu_required,
+                    concurrency=op.num_proc,
                     batch_format="pyarrow",
                 )
             else:
@@ -203,9 +190,7 @@ def process_batch_arrow(table: pyarrow.Table):
                     batch_size=batch_size,
                     batch_format="pyarrow",
                     num_cpus=op.cpu_required,
-                    concurrency=(
-                        None if auto_parallel else op_proc
-                    ),  # use ray default parallelism in cpu mode if num_proc is not specified
+                    concurrency=op.num_proc,
                 )
         elif isinstance(op, Filter):
             columns = self.data.columns()
@@ -229,8 +214,8 @@ def process_batch_arrow(table: pyarrow.Table):
                     fn_constructor_kwargs=op_kwargs,
                     batch_size=batch_size,
                     num_cpus=op.cpu_required,
-                    num_gpus=num_gpus,
-                    concurrency=op_proc,
+                    num_gpus=op.gpu_required,
+                    concurrency=op.num_proc,
                     batch_format="pyarrow",
                 )
             else:
@@ -239,9 +224,7 @@ def process_batch_arrow(table: pyarrow.Table):
                     batch_size=batch_size,
                     batch_format="pyarrow",
                     num_cpus=op.cpu_required,
-                    concurrency=(
-                        None if auto_parallel else op_proc
-                    ),  # use ray default parallelism in cpu mode if num_proc is not specified
+                    concurrency=op.num_proc,
                 )
             if op.stats_export_path is not None:
                 self.data.write_json(op.stats_export_path, force_ascii=False)
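Since _run_single_op now hands op.num_proc, op.cpu_required, and op.gpu_required to Ray verbatim, calculate_ray_np(operators) must resolve any auto settings up front. The real function lives in data_juicer/utils/process_utils.py and is not part of this diff; the sketch below only illustrates that contract, and its resource math is purely an assumption:

import ray

def calculate_ray_np(operators):
    """Illustrative sketch: resolve auto resource settings in place
    so _run_single_op can pass them straight through to Ray."""
    cluster = ray.cluster_resources()
    total_gpus = int(cluster.get("GPU", 0))
    for op in operators:
        if op.use_cuda():
            # GPU ops need a concrete per-actor GPU share and concurrency.
            op.gpu_required = op.gpu_required or 1
            if op.use_auto_proc():
                op.num_proc = max(1, total_gpus // op.gpu_required)
        elif op.use_auto_proc():
            # CPU ops: concurrency=None lets Ray pick its default parallelism,
            # matching the removed "use ray default parallelism" comment.
            op.num_proc = None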

data_juicer/core/sandbox/helper_funcs.py
Lines changed: 20 additions & 0 deletions

@@ -27,3 +27,23 @@ def parse_output(output: str, item: dict, **kwargs):
     A simple implementation.
     """
     return output
+
+
+# Math QA grader
+@ALL_FUNCS.register_module("build_messages_for_math_qa")
+def build_messages_for_math_qa(item: dict, **kwargs):
+    """
+    Build message for math QA grader.
+    """
+    system_key = kwargs.get("system_key", "system")
+    query_key = kwargs.get("query_key", "query")
+    response_key = kwargs.get("response_key", "response")
+
+    system_prompt = item.get(system_key, "")
+    question = item[query_key]
+    answer = item[response_key]
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": f"Question: {question}\nAnswer: {answer}"})
+    return messages
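With the func_kwargs from the sandbox YAML above (system_key: "prompt", query_key: "question", response_key: "answer"), the builder yields a system message followed by a combined question/answer user turn. A quick usage check, with a sample item invented for illustration:

from data_juicer.core.sandbox.helper_funcs import build_messages_for_math_qa

item = {
    "prompt": "You are a strict math grader.",  # invented sample
    "question": "What is 2 + 2?",
    "answer": "4",
}
messages = build_messages_for_math_qa(
    item, system_key="prompt", query_key="question", response_key="answer"
)
assert messages == [
    {"role": "system", "content": "You are a strict math grader."},
    {"role": "user", "content": "Question: What is 2 + 2?\nAnswer: 4"},
]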

data_juicer/ops/base_op.py
Lines changed: 15 additions & 6 deletions

@@ -8,6 +8,7 @@
 from data_juicer.utils.mm_utils import SpecialTokens, size_to_bytes
 from data_juicer.utils.model_utils import free_models
 from data_juicer.utils.process_utils import calculate_np
+from data_juicer.utils.ray_utils import is_ray_mode
 from data_juicer.utils.registry import Registry
 from data_juicer.utils.resource_utils import is_cuda_available
 
@@ -191,10 +192,10 @@ def __init__(self, *args, **kwargs):
         self.accelerator = self._accelerator
 
         # parameters to determine the number of procs for this op
-        self.num_proc = kwargs.get("num_proc", None)
-        self.cpu_required = kwargs.get("cpu_required", 1)
-        self.gpu_required = kwargs.get("gpu_required", 0)
-        self.mem_required = kwargs.get("mem_required", 0)
+        self.num_proc = kwargs.get("num_proc", -1)  # -1 means automatic calculation of concurrency
+        self.cpu_required = kwargs.get("cpu_required", None)
+        self.gpu_required = kwargs.get("gpu_required", None)
+        self.mem_required = kwargs.get("mem_required", None)
         if isinstance(self.mem_required, str):
             self.mem_required = size_to_bytes(self.mem_required) / 1024**3
 
@@ -215,6 +216,12 @@ def __init__(self, *args, **kwargs):
             method = wrap_func_with_nested_access(method)
             setattr(self, name, method)
 
+    def use_auto_proc(self):
+        if is_ray_mode() and not self.use_cuda():  # ray task
+            return self.num_proc == -1
+        else:
+            return not self.num_proc or self.num_proc == -1
+
     def is_batched_op(self):
         return self._batched_op
 
@@ -228,8 +235,10 @@ def runtime_np(self):
         # Local import to avoid logger being serialized in multiprocessing
         from loguru import logger
 
-        op_proc = calculate_np(self._name, self.mem_required, self.cpu_required, self.use_cuda(), self.gpu_required)
-        if self.num_proc is not None:
+        op_proc = calculate_np(
+            self._name, self.mem_required, self.cpu_required or 1, self.use_cuda(), self.gpu_required
+        )
+        if not self.use_auto_proc():
             op_proc = min(op_proc, self.num_proc)
         logger.debug(f"Op [{self._name}] running with number of procs:{op_proc}")
         return op_proc
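The -1 sentinel separates "decide automatically" from an explicit cap: in standalone mode any falsy or -1 num_proc triggers auto calculation, while for non-CUDA ops in Ray mode only -1 does, so an explicit None can still fall through to Ray's default parallelism. A small illustration, assuming a concrete op such as CleanHtmlMapper forwards its kwargs to this base __init__:

from data_juicer.ops.mapper import CleanHtmlMapper

op_auto = CleanHtmlMapper()              # num_proc defaults to -1 -> auto
assert op_auto.use_auto_proc()

op_capped = CleanHtmlMapper(num_proc=8)  # explicit cap
assert not op_capped.use_auto_proc()
# runtime_np() then returns min(calculate_np(...), 8)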

data_juicer/ops/filter/flagged_words_filter.py
Lines changed: 3 additions & 0 deletions

@@ -1,5 +1,8 @@
 # Some code here has been modified from:
 # https://huggingface.co/spaces/huggingface/text-data-filtering
+#
+# The flagged words list comes from https://huggingface.co/spaces/huggingface/text-data-filtering
+# and https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words
 # --------------------------------------------------------
 
 from typing import List

data_juicer/ops/mapper/__init__.py
Lines changed: 6 additions & 0 deletions

@@ -12,6 +12,9 @@
 from .clean_html_mapper import CleanHtmlMapper
 from .clean_ip_mapper import CleanIpMapper
 from .clean_links_mapper import CleanLinksMapper
+from .detect_character_attributes_mapper import DetectCharacterAttributesMapper
+from .detect_character_locations_mapper import DetectCharacterLocationsMapper
+from .detect_main_character_mapper import DetectMainCharacterMapper
 from .dialog_intent_detection_mapper import DialogIntentDetectionMapper
 from .dialog_sentiment_detection_mapper import DialogSentimentDetectionMapper
 from .dialog_sentiment_intensity_mapper import DialogSentimentIntensityMapper
@@ -101,6 +104,9 @@
     "CleanHtmlMapper",
     "CleanIpMapper",
     "CleanLinksMapper",
+    "DetectCharacterAttributesMapper",
+    "DetectCharacterLocationsMapper",
+    "DetectMainCharacterMapper",
     "DialogIntentDetectionMapper",
     "DialogSentimentDetectionMapper",
     "DialogSentimentIntensityMapper",
