### Reproduction
I am using the `SFTTrainer`, and I found that training and evaluation exhibit significant differences between `zero_stage=1` and `zero_stage=2`, as shown in the plot below.
The red lines correspond to `zero_stage=1`, and the black lines correspond to `zero_stage=2`. Can anyone help me? Thanks a lot.
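For reference, the comparison plot was produced roughly as follows. This is a minimal sketch: the log directories are hypothetical placeholders for the timestamped `logs/sft_*/tensorboard` folders written by the training script below, and the scalar tag `eval/loss` assumes the usual HF Trainer naming.

```python
# Hedged sketch for comparing the eval-loss curves of the two runs from their
# TensorBoard logs. The log-dir paths are placeholders for the timestamped
# logs/sft_<timestamp>/tensorboard directories created by sft.py, and the
# scalar tag "eval/loss" follows the usual HF Trainer naming convention.
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def load_scalar(logdir: str, tag: str):
    acc = EventAccumulator(logdir)
    acc.Reload()  # parse the event files on disk
    events = acc.Scalars(tag)
    return [e.step for e in events], [e.value for e in events]


for label, logdir, color in [
    ("zero_stage=1", "logs/sft_zero1/tensorboard", "red"),
    ("zero_stage=2", "logs/sft_zero2/tensorboard", "black"),
]:
    steps, values = load_scalar(logdir, "eval/loss")
    plt.plot(steps, values, color=color, label=label)

plt.xlabel("step")
plt.ylabel("eval/loss")
plt.legend()
plt.savefig("zero1_vs_zero2_eval_loss.png")
```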
To reproduce my issue, please first preprocess the dataset with the following script:
```python
import os

import datasets
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory=".": True

output_dir = "data/UltraFeedback"
os.makedirs(output_dir, exist_ok=True)
dataset = load_dataset("HuggingFaceH4/ultrafeedback_binarized")

# get train set
prompts, chosen, rejected = [], [], []
for item in tqdm(dataset["train_prefs"]):
    if item["chosen"][1]["content"] == "" or item["rejected"][1]["content"] == "":
        continue
    prompts.append(item["prompt"])
    chosen.append(item["chosen"][1]["content"])
    rejected.append(item["rejected"][1]["content"])
df = pd.DataFrame(
    {
        "prompt": prompts,
        "chosen": chosen,
        "rejected": rejected,
    }
)
df.to_parquet(os.path.join(output_dir, "all_train.parquet"))

# split into sft, rm, rl
split_ratio = [0.2, 0.4, 0.4]
total_samples = len(prompts)
sft_size = int(total_samples * split_ratio[0])
rm_size = int(total_samples * split_ratio[1])
rl_size = total_samples - sft_size - rm_size
random_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
sft_df = random_df[:sft_size]
rm_df = random_df[sft_size : sft_size + rm_size]
rl_df = random_df[sft_size + rm_size :]
print(f"Original: {total_samples}")
print(f"SFT: {len(sft_df)} ({len(sft_df)/total_samples:.1%})")
print(f"RM: {len(rm_df)} ({len(rm_df)/total_samples:.1%})")
print(f"RL: {len(rl_df)} ({len(rl_df)/total_samples:.1%})")

# sft
sft_df.to_parquet(os.path.join(output_dir, "raw_sft.parquet"), index=False)
sft_df = sft_df.drop(columns=["rejected"]).rename(columns={"chosen": "completion"})
sft_df.to_parquet(os.path.join(output_dir, "sft.parquet"), index=False)
# rm
rm_df.to_parquet(os.path.join(output_dir, "rm.parquet"), index=False)
# rl
rl_df.to_parquet(os.path.join(output_dir, "rl.parquet"), index=False)

# get test set
prompts, chosen, rejected = [], [], []
for item in tqdm(dataset["test_prefs"]):
    if item["chosen"][1]["content"] == "" or item["rejected"][1]["content"] == "":
        continue
    prompts.append(item["prompt"])
    chosen.append(item["chosen"][1]["content"])
    rejected.append(item["rejected"][1]["content"])
test_df = pd.DataFrame(
    {
        "prompt": prompts,
        "chosen": chosen,
        "rejected": rejected,
    }
)
# sft
sft_df = test_df.drop(columns=["rejected"]).rename(columns={"chosen": "completion"})
sft_df.to_parquet(os.path.join(output_dir, "test_sft.parquet"), index=False)
# rm
test_df.to_parquet(os.path.join(output_dir, "test_rm.parquet"), index=False)
# rl
test_df.to_parquet(os.path.join(output_dir, "test_rl.parquet"), index=False)
```

Then, start training via `bash train.sh`. The content of `train.sh` is:
```bash
set -x
umask 000
source .venv/bin/activate

export TORCH_CUDA_ALLOC_CONF=expandable_segments:True
export OMP_NUM_THREADS=1
export PRETRAINED_MODEL_NAME=Qwen/Qwen2.5-1.5B
export N_GPU=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -n 1)

accelerate launch \
    --config_file configs/sft_zero1_4gpu.yaml \
    sft.py \
    model_name=${PRETRAINED_MODEL_NAME} \
    trainer.per_device_train_batch_size=4 \
    trainer.eval_steps=10 \
    trainer.seed=42 \
    trainer.full_determinism=true \
    compute.n_gpus=${N_GPU}
```

where my `sft.py` is:
```python
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

from datetime import datetime

import hydra
from accelerate import PartialState
from accelerate.utils import broadcast_object_list
from datasets import Dataset, load_dataset
from omegaconf import DictConfig, OmegaConf
from transformers import AutoTokenizer, PreTrainedTokenizer
from trl import SFTTrainer
from trl.trainer.sft_config import SFTConfig


def sync_cfg(state: PartialState, cfg: DictConfig, key: str):
    value_list = [getattr(cfg, key)]
    broadcast_object_list(value_list, from_process=0)
    setattr(cfg, key, value_list[0])
    state.wait_for_everyone()


def format_sft_dataset(
    dataset: Dataset, tokenizer: PreTrainedTokenizer, num_proc: int = 16
) -> Dataset:
    def formatter(example: dict) -> dict:
        return {
            "prompt": tokenizer.apply_chat_template(
                (
                    [{"role": "user", "content": example["prompt"]}]
                    if isinstance(example["prompt"], str)
                    else list(example["prompt"])
                ),
                tokenize=False,
                add_generation_prompt=True,
            ),
            "completion": example["completion"] + tokenizer.eos_token,
        }

    return dataset.map(formatter, num_proc=num_proc)


@hydra.main(config_path="configs", config_name="sft.yaml", version_base=None)
def main(cfg: DictConfig):
    state = PartialState()

    # add timestamp to exp_dir
    if state.is_main_process:
        cfg.exp_dir = f"{cfg.exp_dir}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        os.makedirs(cfg.exp_dir, exist_ok=True)
    # sync
    sync_cfg(state, cfg, "exp_dir")
    OmegaConf.resolve(cfg)

    # resolve compute config
    assert (
        cfg.compute.global_batch_size
        % (cfg.trainer.per_device_train_batch_size * cfg.compute.n_gpus)
        == 0
    ), "global_batch_size must be divisible by per_device_train_batch_size * n_gpus"
    cfg.trainer.gradient_accumulation_steps = cfg.compute.global_batch_size // (
        cfg.trainer.per_device_train_batch_size * cfg.compute.n_gpus
    )
    print(f"Gradient accumulation steps: {cfg.trainer.gradient_accumulation_steps}")

    # load dataset
    if cfg.dataset.is_local:
        train_dataset = load_dataset(
            cfg.dataset.train.path.split(".")[-1],
            data_files=cfg.dataset.train.path,
            split=cfg.dataset.train.split,
        )
        eval_dataset = load_dataset(
            cfg.dataset.eval.path.split(".")[-1],
            data_files=cfg.dataset.eval.path,
            split=cfg.dataset.eval.split,
        )
    else:
        train_dataset = load_dataset(
            cfg.dataset.train.name, split=cfg.dataset.train.split
        )
        eval_dataset = load_dataset(cfg.dataset.eval.name, split=cfg.dataset.eval.split)

    if cfg.use_ms:
        from modelscope.utils.hf_util import patch_hub

        patch_hub()

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    train_dataset = format_sft_dataset(train_dataset, tokenizer)
    eval_dataset = format_sft_dataset(eval_dataset, tokenizer)

    # start training
    if state.is_main_process:
        OmegaConf.save(cfg, os.path.join(cfg.exp_dir, "args.yaml"))
    trainer = SFTTrainer(
        model=cfg.model_name,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=SFTConfig(**OmegaConf.to_container(cfg.trainer, resolve=True)),
    )
    trainer.train()
    trainer.save_model(cfg.trainer.output_dir)


if __name__ == "__main__":
    main()
```

The two config files live in the same folder, named `configs`. Please create this folder first, and then create `sft.yaml` and `sft_zero1_4gpu.yaml` in it:
```yaml
# sft.yaml
defaults:
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

hydra:
  output_subdir: null
  run:
    dir: .

exp_dir: logs/sft

# model
model_name: Qwen/Qwen2.5-1.5B
use_ms: true

# dataset
dataset:
  is_local: true
  train:
    path: data/UltraFeedback/sft.parquet
    split: train
  eval:
    path: data/UltraFeedback/test_sft.parquet
    split: train

compute:
  global_batch_size: 256
  n_gpus: 4

# sft
trainer:
  output_dir: ${exp_dir}/ckpts
  max_length: 4096
  eval_strategy: steps
  eval_steps: 10
  per_device_train_batch_size: 2
  gradient_accumulation_steps: ???
  num_train_epochs: 3
  gradient_checkpointing: true
  activation_offloading: false
  bf16: true
  use_liger_kernel: true
  packing: false
  seed: 42
  full_determinism: true
  report_to: ["tensorboard"]
  logging_dir: ${exp_dir}/tensorboard
  save_strategy: "no"
```
```yaml
# sft_zero1_4gpu.yaml
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: cpu
  zero3_init_flag: false
  zero_stage: 1 # change to 2 to test zero_stage 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: 'bf16'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```
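To confirm that the only effective difference between the two runs is the ZeRO stage, I can also print the DeepSpeed config that accelerate actually resolves. This is a hedged sketch meant to be pasted into `sft.py` right after `trainer = SFTTrainer(...)`; the attribute path is what recent accelerate versions expose and may differ between versions.

```python
# Hedged diagnostic: dump the DeepSpeed config accelerate resolved for this run.
# Intended to be placed in sft.py right after `trainer = SFTTrainer(...)`;
# `state` and `trainer` refer to the objects already defined there.
import json

if state.is_main_process:
    ds_plugin = trainer.accelerator.state.deepspeed_plugin
    if ds_plugin is not None:
        print(json.dumps(ds_plugin.deepspeed_config, indent=2, default=str))
```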
### System Info

- Platform: Linux-5.10.134-13.101.al8.x86_64-x86_64-with-glibc2.39
- Python version: 3.10.16
- TRL version: 0.25.1+056703c
- PyTorch version: 2.8.0
- accelerator(s): NVIDIA L20Z, NVIDIA L20Z, NVIDIA L20Z, NVIDIA L20Z
- Transformers version: 4.57.1
- Accelerate version: 1.11.0
- Accelerate config: not found
- Datasets version: 4.4.1
- HF Hub version: 0.36.0
- bitsandbytes version: not installed
- DeepSpeed version: 0.18.2
- Liger-Kernel version: 0.6.3
- LLM-Blender version: not installed
- OpenAI version: 2.6.1
- PEFT version: 0.18.0
- vLLM version: not installed
### Checklist
- I have checked that my issue isn't already filed (see open issues)
- I have included my system information
- Any code provided is minimal, complete, and reproducible (more on MREs)
- Any code provided is properly formatted in code blocks, (no screenshot, more on code blocks)
- Any traceback provided is complete