-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Description
Reproduction
File "/media/1luik/46BB55AF65F351D4/4/xl.py", line 730, in train
trainer.train()
File "/home/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2325, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2674, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/transformers/trainer.py", line 4020, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/extras/profiling.py", line 98, in wrapper
return func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/media/1luik/46BB55AF65F351D4/4/xl.py", line 730, in train
trainer.train()
File "/home/venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 1761, in compute_loss
return self._compute_loss(model, inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 1772, in _compute_loss
per_token_logps, entropies = self._get_per_token_logps_and_entropies(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2325, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/extras/profiling.py", line 98, in wrapper
return func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/transformers/trainer.py", line 2674, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 903, in _get_per_token_logps_and_entropies
logits = model(**model_inputs).logits
^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/transformers/trainer.py", line 4020, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/extras/profiling.py", line 98, in wrapper
return func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 1761, in compute_loss
return self._compute_loss(model, inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 1772, in _compute_loss
per_token_logps, entropies = self._get_per_token_logps_and_entropies(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/engine.py", line 2179, in forward
loss = self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/extras/profiling.py", line 98, in wrapper
return func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/trl/trainer/grpo_trainer.py", line 903, in _get_per_token_logps_and_entropies
logits = model(**model_inputs).logits
^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
return inner()
^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1818, in inner
args_result = hook(self, args)
^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 253, in _start_of_forward_hook
self.get_param_coordinator().reset_step()
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/engine.py", line 2179, in forward
loss = self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 241, in reset_step
self.construct_parameter_trace_from_module_trace()
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
return inner()
^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 225, in construct_parameter_trace_from_module_trace
self.record_parameters(sub_module)
File "/home/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1818, in inner
args_result = hook(self, args)
^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 217, in record_parameters
step_id = self.__step_id_module_fetched_for[sub_module.ds_id].popleft()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/parameter_offload.py", line 253, in _start_of_forward_hook
self.get_param_coordinator().reset_step()
IndexError: pop from an empty deque
File "/home/venv/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 241, in reset_step
self.construct_parameter_trace_from_module_trace()
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 225, in construct_parameter_trace_from_module_trace
self.record_parameters(sub_module)
File "/home/venv/lib/python3.12/site-packages/deepspeed/runtime/zero/partitioned_param_coordinator.py", line 217, in record_parameters
step_id = self.__step_id_module_fetched_for[sub_module.ds_id].popleft()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: pop from an empty deque
System Info
((venv) ) 1luik@1luik-PC:/media/1luik/46BB55AF65F351D4/4$ trl env
Copy-paste the following information when reporting an issue:
- Platform: Linux-6.6.104-amd64-desktop-hwe-x86_64-with-glibc2.38
- Python version: 3.12.11
- TRL version: 0.25.1
- PyTorch version: 2.9.0
- accelerator(s): Tesla V100-SXM2-16GB, Tesla V100-SXM2-16GB, Tesla V100-SXM2-16GB, Tesla V100-SXM2-16GB
- Transformers version: 4.57.1
- Accelerate version: 1.11.0
- Accelerate config:
- compute_environment: LOCAL_MACHINE
- distributed_type: DEEPSPEED
- mixed_precision: fp16
- use_cpu: False
- debug: True
- num_processes: 4
- machine_rank: 0
- num_machines: 1
- rdzv_backend: static
- same_network: True
- main_training_function: main
- enable_cpu_affinity: False
- deepspeed_config: {'gradient_accumulation_steps': 1, 'offload_optimizer_device': 'cpu', 'offload_param_device': 'cpu', 'zero3_init_flag': True, 'zero3_save_16bit_model': False, 'zero_stage': 3}
- downcast_bf16: no
- tpu_use_cluster: False
- tpu_use_sudo: False
- tpu_env: []
- Datasets version: 4.3.0
- HF Hub version: 0.36.0
- bitsandbytes version: 0.48.2
- DeepSpeed version: 0.18.2
- Liger-Kernel version: not installed
- LLM-Blender version: not installed
- OpenAI version: not installed
- PEFT version: 0.18.0
- vLLM version: not installed
Checklist
- I have checked that my issue isn't already filed (see open issues)
- I have included my system information
- Any code provided is minimal, complete, and reproducible (more on MREs)
- Any code provided is properly formatted in code blocks (no screenshots; more on code blocks)
- Any traceback provided is complete