@@ -1,7 +1,6 @@
 from __future__ import annotations

 import os
-import sys
 from functools import partial
 from typing import Any, Dict, List, Literal, Optional, Union

@@ -16,7 +15,6 @@
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.file_utils import is_remote_path
 from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.process_utils import calculate_np
 from data_juicer.utils.resource_utils import cuda_device_count
 from data_juicer.utils.webdataset_utils import _custom_default_decoder

@@ -148,27 +146,16 @@ def process(self, operators, *, exporter=None, checkpointer=None, tracer=None):
             return self
         if not isinstance(operators, list):
             operators = [operators]
+
+        from data_juicer.utils.process_utils import calculate_ray_np
+
+        calculate_ray_np(operators)
+
         for op in operators:
             self._run_single_op(op)
         return self

     def _run_single_op(self, op):
-        # TODO: optimize auto proc
-        auto_parallel = False
-        if op.num_proc:
-            op_proc = op.num_proc
-        else:
-            auto_parallel = True
-            op_proc = sys.maxsize
-            auto_op_proc = calculate_np(op._name, op.mem_required, op.cpu_required, op.use_cuda(), op.gpu_required)
-            op_proc = min(op_proc, auto_op_proc)
-
-        # use ray default parallelism in cpu mode if op.num_proc is not specified
-        if op.use_cuda() or not auto_parallel:
-            logger.info(f"Op [{op._name}] running with number of procs:{op_proc}")
-
-        num_gpus = op.gpu_required if op.gpu_required else get_num_gpus(op, op_proc)
-
         if op._name in TAGGING_OPS.modules and Fields.meta not in self.data.columns():

             def process_batch_arrow(table: pyarrow.Table):
@@ -193,8 +180,8 @@ def process_batch_arrow(table: pyarrow.Table):
                     fn_constructor_kwargs=op_kwargs,
                     batch_size=batch_size,
                     num_cpus=op.cpu_required,
-                    num_gpus=num_gpus,
-                    concurrency=op_proc,
+                    num_gpus=op.gpu_required,
+                    concurrency=op.num_proc,
                     batch_format="pyarrow",
                 )
             else:
200187 else :
@@ -203,9 +190,7 @@ def process_batch_arrow(table: pyarrow.Table):
                     batch_size=batch_size,
                     batch_format="pyarrow",
                     num_cpus=op.cpu_required,
-                    concurrency=(
-                        None if auto_parallel else op_proc
-                    ),  # use ray default parallelism in cpu mode if num_proc is not specified
+                    concurrency=op.num_proc,
                 )
         elif isinstance(op, Filter):
             columns = self.data.columns()
@@ -229,8 +214,8 @@ def process_batch_arrow(table: pyarrow.Table):
                     fn_constructor_kwargs=op_kwargs,
                     batch_size=batch_size,
                     num_cpus=op.cpu_required,
-                    num_gpus=num_gpus,
-                    concurrency=op_proc,
+                    num_gpus=op.gpu_required,
+                    concurrency=op.num_proc,
                     batch_format="pyarrow",
                 )
             else:
236221 else :
@@ -239,9 +224,7 @@ def process_batch_arrow(table: pyarrow.Table):
                     batch_size=batch_size,
                     batch_format="pyarrow",
                     num_cpus=op.cpu_required,
-                    concurrency=(
-                        None if auto_parallel else op_proc
-                    ),  # use ray default parallelism in cpu mode if num_proc is not specified
+                    concurrency=op.num_proc,
                 )
             if op.stats_export_path is not None:
                 self.data.write_json(op.stats_export_path, force_ascii=False)
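
Note on the new call: the per-op sizing removed from _run_single_op above (the calculate_np / sys.maxsize fallback plus get_num_gpus) now happens once for the whole pipeline, with calculate_ray_np(operators) filling in each op's num_proc and gpu_required before the run loop, so _run_single_op can pass them straight to map_batches. A minimal sketch of that idea, assuming ops expose the attributes the diff uses (num_proc, cpu_required, gpu_required, use_cuda()); the real calculate_ray_np in data_juicer.utils.process_utils is more involved, and the fallback heuristics below are assumptions:

import ray

def plan_ray_resources(operators):
    # Hypothetical stand-in for calculate_ray_np: derive num_proc and
    # gpu_required for every op up front from the cluster's resources.
    res = ray.cluster_resources() if ray.is_initialized() else {}
    total_cpus = int(res.get("CPU", 1))
    total_gpus = int(res.get("GPU", 0))
    for op in operators:
        if op.use_cuda():
            # Assumed default: one GPU per worker if the op did not ask for more.
            op.gpu_required = op.gpu_required or 1
            op.num_proc = op.num_proc or max(1, int(total_gpus // op.gpu_required))
        else:
            cpus_per_proc = op.cpu_required or 1
            op.num_proc = op.num_proc or max(1, int(total_cpus // cpus_per_proc))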
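
For reference, the pattern every hunk converges on: Ray Data's map_batches with a callable class runs the op in an actor pool, where concurrency fixes the pool size and num_cpus / num_gpus are the per-actor reservations. A self-contained sketch under that reading (the toy op and data are illustrative, not data-juicer's API), assuming Ray >= 2.9, where concurrency replaced the older compute=ActorPoolStrategy(...) argument:

import pyarrow
import pyarrow.compute as pc
import ray

class UpperMapper:
    # Callable class: map_batches constructs one instance per pool actor,
    # passing fn_constructor_kwargs to __init__ (mirrors op_kwargs above).
    def __init__(self, column: str):
        self.column = column

    def __call__(self, table: pyarrow.Table) -> pyarrow.Table:
        upper = pc.utf8_upper(table[self.column])
        idx = table.schema.get_field_index(self.column)
        return table.set_column(idx, self.column, upper)

ds = ray.data.from_items([{"text": "hello"}, {"text": "world"}])
ds = ds.map_batches(
    UpperMapper,
    fn_constructor_kwargs={"column": "text"},
    batch_format="pyarrow",   # the UDF receives pyarrow.Table batches
    num_cpus=1,               # per-actor reservation, as num_cpus=op.cpu_required
    concurrency=2,            # fixed actor-pool size, as concurrency=op.num_proc
)
print(ds.take_all())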