54 changes: 54 additions & 0 deletions examples/config/dpo/full_tp_pp_ep_sd_no_packing.yaml
@@ -0,0 +1,54 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
num_samples_each_epoch: 6000000
packing: false
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-30B-A3B
attn_impl: flashmask

### finetuning
# base
stage: DPO
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_30b_a3b_dpo_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-6

# performance
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
expert_parallel_degree: 4
sharding_parallel_degree: 2
sharding: stage1
sequence_parallel: false
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
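
Note: this config combines tensor, pipeline, expert, and sharding parallelism. Below is a minimal sketch of how one might sanity-check the device count implied by such a file; it assumes PyYAML is available, the file name check_parallel_config.py is hypothetical, and the rule that expert parallelism is carved out of the sharding/data-parallel groups (rather than multiplying the world size) is an assumption, not a guarantee from this repo.

# check_parallel_config.py -- hypothetical helper, not part of this PR.
import yaml

def required_gpus(path: str) -> int:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    tp = cfg.get("tensor_parallel_degree", 1)
    pp = cfg.get("pipeline_parallel_degree", 1)
    sd = cfg.get("sharding_parallel_degree", 1)
    # Assumption: expert_parallel_degree reuses the sharding/data-parallel
    # dimension, so it does not add another multiplicative factor here.
    return tp * pp * sd

if __name__ == "__main__":
    n = required_gpus("examples/config/dpo/full_tp_pp_ep_sd_no_packing.yaml")
    print(f"Config expects {n} devices (2 TP x 2 PP x 2 sharding)")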
52 changes: 52 additions & 0 deletions examples/config/dpo/full_tp_pp_no_packing.yaml
@@ -0,0 +1,52 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
num_samples_each_epoch: 6000000
packing: false
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
attn_impl: flashmask

### finetuning
# base
stage: DPO
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-6

# performance
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
sharding: stage1
sequence_parallel: false
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
@@ -44,8 +44,8 @@ learning_rate: 1.0e-6
 tensor_parallel_degree: 2
 pipeline_parallel_degree: 2
 pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
-sequence_parallel: true
 sharding: stage1
+sequence_parallel: true
 recompute: true
 bf16: true
 fp16_opt_level: O2
55 changes: 55 additions & 0 deletions examples/config/dpo/full_tp_pp_packing_optim_vram.yaml
@@ -0,0 +1,55 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
num_samples_each_epoch: 6000000
packing: true
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
attn_impl: flashmask

### finetuning
# base
stage: DPO
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-6

# performance
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
sharding: stage1
sequence_parallel: true
use_fused_head_and_loss_fn: true
loss_subbatch_sequence_length: 8192
tensorwise_offload_optimizer: true
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
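
Note: relative to the other configs, this one enables packing together with use_fused_head_and_loss_fn, loss_subbatch_sequence_length, and tensorwise_offload_optimizer to trade extra compute for lower peak VRAM. The sketch below illustrates the general idea behind computing the loss in sequence sub-batches of 8192 tokens so the full [seq_len, vocab] logits are never materialized at once; it is a generic NumPy illustration, not this repo's fused implementation.

# Generic illustration of sub-batched loss computation (not the repo's code).
import numpy as np

def chunked_token_loss(hidden, lm_head_w, labels, chunk=8192):
    # hidden: [seq_len, d], lm_head_w: [vocab, d], labels: [seq_len] int
    out = []
    for s in range(0, hidden.shape[0], chunk):
        logits = hidden[s:s + chunk] @ lm_head_w.T             # only [chunk, vocab] in memory
        logits = logits - logits.max(axis=-1, keepdims=True)   # numerical stability
        log_z = np.log(np.exp(logits).sum(axis=-1))
        picked = logits[np.arange(logits.shape[0]), labels[s:s + chunk]]
        out.append(log_z - picked)                             # -log p(label)
    return np.concatenate(out)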
53 changes: 53 additions & 0 deletions examples/config/dpo/full_tp_pp_sd_no_packing.yaml
@@ -0,0 +1,53 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
num_samples_each_epoch: 6000000
packing: false
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
attn_impl: flashmask

### finetuning
# base
stage: DPO
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-6

# performance
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
sharding_parallel_degree: 2
sharding: stage1
sequence_parallel: false
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
51 changes: 51 additions & 0 deletions examples/config/dpo/full_tp_sd_packing.yaml
@@ -0,0 +1,51 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
num_samples_each_epoch: 6000000
packing: true
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
attn_impl: flashmask

### finetuning
# base
stage: DPO
fine_tuning: full
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-6

# performance
tensor_parallel_degree: 2
sharding_parallel_degree: 2
sharding: stage1
sequence_parallel: true
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
55 changes: 55 additions & 0 deletions examples/config/dpo/lora_tp_pp_ep_sd_no_packing.yaml
@@ -0,0 +1,55 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
packing: false
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-30B-A3B
attn_impl: flashmask
lora: true
lora_rank: 8

### finetuning
# base
stage: DPO
fine_tuning: lora
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_30b_a3b_dpo_lora_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-5

# performance
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
expert_parallel_degree: 4
sharding_parallel_degree: 2
sharding: stage1
sequence_parallel: false
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
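
Note: with fine_tuning: lora and lora_rank: 8, only the low-rank adapter matrices are trained, which is consistent with the higher learning rate here (1.0e-5) than in the full-parameter configs (1.0e-6). As a rough back-of-the-envelope check, each adapted weight of shape d_out x d_in gains rank * (d_in + d_out) trainable parameters; the helper below is illustrative only and the example layer shape is hypothetical.

# Illustrative arithmetic only; layer shapes are not read from the model.
def lora_extra_params(d_in: int, d_out: int, rank: int = 8) -> int:
    # Down-projection (d_in x rank) plus up-projection (rank x d_out).
    return rank * (d_in + d_out)

# e.g. a hypothetical 2048 x 2048 projection with lora_rank 8:
print(lora_extra_params(2048, 2048))   # 32768 extra trainable parameters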
53 changes: 53 additions & 0 deletions examples/config/dpo/lora_tp_pp_no_packing.yaml
@@ -0,0 +1,53 @@
### data
train_dataset_type: erniekit
eval_dataset_type: erniekit
train_dataset_path: ./data/dpo/train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./data/dpo/dev.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 8192
packing: false
mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
attn_impl: flashmask
lora: true
lora_rank: 8

### finetuning
# base
stage: DPO
fine_tuning: lora
seed: 23
do_train: true
do_eval: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
num_train_epochs: 1
max_steps: -1
eval_steps: 100
evaluation_strategy: steps
save_steps: 100
save_strategy: steps
logging_steps: 1
gradient_accumulation_steps: 4
logging_dir: ./vdl_log
output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_lora_ckpts_parallel
disable_tqdm: true
eval_accumulation_steps: 16

# train
warmup_steps: 20
learning_rate: 1.0e-5

# performance
tensor_parallel_degree: 2
pipeline_parallel_degree: 2
pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
sharding: stage1
sequence_parallel: false
recompute: true
bf16: true
fp16_opt_level: O2
unified_checkpoint: true
@@ -45,8 +45,8 @@ learning_rate: 1.0e-5
 tensor_parallel_degree: 2
 pipeline_parallel_degree: 2
 pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
-sequence_parallel: true
 sharding: stage1
+sequence_parallel: true
 recompute: true
 bf16: true
 fp16_opt_level: O2