57 commits
- cca562d  migrate from speech llm (yuekaizhang, Feb 26, 2025)
- e6897b1  make asr decode results align (yuekaizhang, Feb 26, 2025)
- 6b69276  add training stage (yuekaizhang, Apr 11, 2025)
- 202d764  remove text norm (yuekaizhang, Apr 14, 2025)
- 1d11662  fix multi rounds data (yuekaizhang, Apr 14, 2025)
- 3ad075a  s2t training (yuekaizhang, Apr 15, 2025)
- 0c02da8  refine decoding method (yuekaizhang, Apr 15, 2025)
- 458d697  fix batch_size>1 decoding bug (yuekaizhang, Apr 15, 2025)
- bdb60f6  add codec lm (yuekaizhang, Apr 21, 2025)
- b305cda  fix padding side (yuekaizhang, Apr 21, 2025)
- 7db4005  add flash attn support (yuekaizhang, Apr 21, 2025)
- 09d81b4  change padding side name (yuekaizhang, Apr 21, 2025)
- 23fdef2  add codec decode (yuekaizhang, Apr 21, 2025)
- 478d56e  fix bugs when padding right (yuekaizhang, Apr 23, 2025)
- 2e9be46  debug (yuekaizhang, Apr 24, 2025)
- 3642dfd  refactor code (yuekaizhang, Apr 25, 2025)
- 6955639  add qwen omni web demo (yuekaizhang, Apr 25, 2025)
- 6ea7ec8  remove offline tab (yuekaizhang, Apr 25, 2025)
- 9a07363  remove unsed (yuekaizhang, Apr 25, 2025)
- 72addd4  change place (yuekaizhang, Apr 25, 2025)
- 47920c2  add gradio demo (yuekaizhang, Apr 25, 2025)
- 71a0a44  add history cache (yuekaizhang, Apr 25, 2025)
- d742043  refactor decode part (yuekaizhang, Apr 25, 2025)
- 448a4ee  update hf dataset loading into lhotse (yuekaizhang, Apr 29, 2025)
- 360f0aa  update README (yuekaizhang, Apr 29, 2025)
- 11bd3c9  lint (yuekaizhang, Apr 29, 2025)
- 08be51a  change pic (yuekaizhang, Apr 29, 2025)
- 2dd40b6  add vocalnet en data (yuekaizhang, May 8, 2025)
- 7cc366d  add en data, cosy2 token for training (yuekaizhang, May 8, 2025)
- e41c1ca  add dependency (yuekaizhang, May 8, 2025)
- 37db659  remove k2 dependency (May 8, 2025)
- bd2df57  add debug script (yuekaizhang, May 8, 2025)
- b20a0d0  add on the fly feature (yuekaizhang, May 9, 2025)
- 89781b9  add cosyvoice2 decode (yuekaizhang, May 12, 2025)
- cbf3af3  add voicebench eval (yuekaizhang, May 13, 2025)
- e657258  fix mmsu (yuekaizhang, May 13, 2025)
- f81363d  add speech continuation pretraining (yuekaizhang, May 15, 2025)
- bfb4ebe  remove triton (yuekaizhang, May 15, 2025)
- 0e8c1db  fix speed perturb issue (yuekaizhang, May 16, 2025)
- e52581e  support local_rank for multi-node (yuekaizhang, May 16, 2025)
- 4a29430  add loss type (yuekaizhang, May 19, 2025)
- 50fc1ab  add multi-node (yuekaizhang, May 19, 2025)
- 9cdd393  add server url (yuekaizhang, May 20, 2025)
- ca84aff  remove cosyvoice lib (yuekaizhang, May 20, 2025)
- 7aa6c80  add multi gpu processing (yuekaizhang, May 22, 2025)
- 7a12d88  update (yuekaizhang, May 22, 2025)
- 9fff18e  refactor code (yuekaizhang, May 23, 2025)
- dd858f0  support instruct s2s (yuekaizhang, May 23, 2025)
- e6e1f3f  add tts stage (yuekaizhang, May 23, 2025)
- 39700d5  refactor train to reuse code (yuekaizhang, May 27, 2025)
- 1281d7a  add tts training (yuekaizhang, May 27, 2025)
- 5a7c72c  add tts task decode (yuekaizhang, May 27, 2025)
- 49256fa  fix tts stage decode (yuekaizhang, May 28, 2025)
- 4c0396f  support text2speech ultrachat (yuekaizhang, Jun 3, 2025)
- 5becf69  remove concat three items (yuekaizhang, Jun 3, 2025)
- 80677a5  remove stats (yuekaizhang, Jun 3, 2025)
- 559f9e2  fix repeat bos and pad id (yuekaizhang, Jun 4, 2025)
egs/speech_llm/SPEECH2SPEECH/README.md (new file, +55 lines)

# Introduction

This recipe includes scripts for training speech2speech models.

# SPEECH2SPEECH

The following table lists the folders for different tasks.

| Recipe | Speech Input | Speech Output | Comment |
|--------|--------------|---------------|---------|
| Qwen-omni like | Continuous embeddings | CosyVoice 1 50 Hz single-codebook tokens | Text-driven; a Thinker LLM generates the text tokens and a small Talker LLM generates the speech tokens |

### [Qwen-omni like Speech2speech Recipe](./qwen_omni)

A [Qwen2.5-Omni](https://github.com/QwenLM/Qwen2.5-Omni)-style model trained on the [worstchan/Belle_1.4M-SLAM-Omni](https://huggingface.co/datasets/worstchan/Belle_1.4M-SLAM-Omni) dataset.

<br>
<p align="center">
<img src="assets/framework.png" width="800"/>
</p>
<br>

The command for training is:
```bash
torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
--max-duration 50 \
--enable-musan False \
--exp-dir $exp_dir \
--speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
--llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
--manifest-dir data/fbank \
--deepspeed \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
```
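
If a run is interrupted, it can be resumed by passing the latest saved model and sampler state back to `train.py`, as `exp.sh` in this recipe does. A minimal sketch, assuming checkpoints are written as `checkpoint-<step>` directories under `$exp_dir` and that `$train_cmd_args` holds the training flags shown above:

```bash
# Resume from a saved checkpoint (flag names as used in exp.sh).
step=10000  # illustrative: replace with your latest saved step
torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
  $train_cmd_args \
  --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin \
  --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt
```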

The command for decoding is:
```bash
python3 ./qwen_omni/decode.py \
--max-duration 1 \
--exp-dir $exp_dir \
--speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--epoch 999 --avg 1 \
--manifest-dir data/fbank \
--use-flash-attn True \
--method e2e-epoch10_speech2speech \
--enable-speech-output True \
--token2wav-path models/CosyVoice-300M-SFT \
--use-lora True
```
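
The `--token2wav-path` argument points to a local CosyVoice checkpoint. A sketch of one way to fetch it, assuming the model is mirrored on Hugging Face under `FunAudioLLM/CosyVoice-300M-SFT` (adjust the repo id to wherever you host it):

```bash
# Assumed repo id; downloads the CosyVoice token2wav model to the path used above.
pip install -U "huggingface_hub[cli]"
huggingface-cli download FunAudioLLM/CosyVoice-300M-SFT --local-dir models/CosyVoice-300M-SFT
```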

Please see [`prepare.sh`](./prepare.sh) for more details.
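
As in other icefall recipes, `prepare.sh` is organized into numbered stages. An illustrative invocation, assuming the usual `--stage`/`--stop-stage` options (check the script for the actual stage numbers):

```bash
# Illustrative: run only the first two data-preparation stages.
bash prepare.sh --stage 0 --stop-stage 1
```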
egs/speech_llm/SPEECH2SPEECH/exp.sh (new file, +234 lines)
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice
# export HF_HOME="/lustre/fsw/general_sa/yuekaiz/.cache/huggingface"
set -eou pipefail

stage=$1
stop_stage=$2
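
# Usage: bash exp.sh <stage> <stop_stage>
# e.g. "bash exp.sh 17 17" runs only stage 17 (illustrative invocation; this
# file currently defines stages 17-19, with stages 20-21 kept commented out).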


log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

if [ $stage -le 17 ] && [ $stop_stage -ge 17 ]; then
echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
if [ ! -L "/workspace/slam" ]; then
cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
fi
log "stage 17: Training Speech2Speech Model, full parameters"
exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s
pretrained_dir=./qwen_omni/exp_speech2text
ngpu=4

latest_checkpoint_step=-1
# Check if exp_dir exists and is a directory
if [ -d "$exp_dir" ]; then
# List directories matching checkpoint-* and find the one with the largest step number
for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# Extract step number using parameter expansion
current_step=${checkpoint_name#checkpoint-}
# Ensure current_step is a number
if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
latest_checkpoint_step=$current_step
fi
done
fi

train_cmd_args="--max-duration 200 \
--enable-musan False \
--exp-dir $exp_dir \
--last-stage-model-path $pretrained_dir/checkpoint-58548/pytorch_model.bin \
--speech-encoder-path-or-name models/large-v2.pt \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--on-the-fly-feats True --on-the-fly-speed-perturb False \
--deepspeed \
--huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--dataset vocalnet_ultrachat_voiceassistant_instruct_s2s --num-epochs 10 \
--use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False"

if [ "$latest_checkpoint_step" -ge 0 ]; then
log "Continuing training from checkpoint-$latest_checkpoint_step"
step=$latest_checkpoint_step
train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
else
log "Starting training from scratch as no checkpoint was found in $exp_dir"
# No pretrained model or sampler state dict needed for the first run
fi

torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
$train_cmd_args
fi

if [ $stage -le 18 ] && [ $stop_stage -ge 18 ]; then
echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
# check if the link exists, if not exist, create it
if [ ! -L "/workspace/slam" ]; then
cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
fi
log "stage 17: Training Speech2Speech Model, full parameters"
exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s_librispeech
pretrained_dir=./qwen_omni/exp_speech2text
ngpu=4

latest_checkpoint_step=-1
# Check if exp_dir exists and is a directory
if [ -d "$exp_dir" ]; then
# List directories matching checkpoint-* and find the one with the largest step number
for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# Extract step number using parameter expansion
current_step=${checkpoint_name#checkpoint-}
# Ensure current_step is a number
if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
latest_checkpoint_step=$current_step
fi
done
fi

train_cmd_args="--max-duration 200 \
--enable-musan False \
--exp-dir $exp_dir \
--last-stage-model-path $pretrained_dir/checkpoint-58548/pytorch_model.bin \
--speech-encoder-path-or-name models/large-v2.pt \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--on-the-fly-feats True --on-the-fly-speed-perturb False \
--deepspeed \
--huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--dataset vocalnet_ultrachat_voiceassistant_instruct_s2s_librispeech --num-epochs 10 \
--use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False"

if [ "$latest_checkpoint_step" -ge 0 ]; then
log "Continuing training from checkpoint-$latest_checkpoint_step"
step=$latest_checkpoint_step
train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
else
log "Starting training from scratch as no checkpoint was found in $exp_dir"
# No pretrained model or sampler state dict needed for the first run
fi

torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
$train_cmd_args
fi

if [ $stage -le 19 ] && [ $stop_stage -ge 19 ]; then
log "stage 19: Training TTS Model"
# Earlier experiment directories, kept for reference; only the last assignment is active.
# exp_dir=./qwen_omni/exp_tts_ultra_chat_voice_assistant
# exp_dir=./qwen_omni/exp_tts_emilia_en_tts_only_template
exp_dir=./qwen_omni/exp_tts_emilia_en_tts_three_concat
pretrained_dir=./qwen_omni/exp_speech2text
ngpu=4

latest_checkpoint_step=-1
# Check if exp_dir exists and is a directory
if [ -d "$exp_dir" ]; then
# List directories matching checkpoint-* and find the one with the largest step number
for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# Extract step number using parameter expansion
current_step=${checkpoint_name#checkpoint-}
# Ensure current_step is a number
if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
latest_checkpoint_step=$current_step
fi
done
fi
# --dataset ultra_chat_voice_assistant
train_cmd_args="--batch-size 30 \
--exp-dir $exp_dir \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--enable-speech-input False \
--deepspeed \
--dataset /lustre/fsw/general_sa/yuekaiz/s2s/VoxBox/manifests_emilia_en \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--num-epochs 3 \
--use-lora False --unfreeze-llm False --enable-speech-output True"

if [ "$latest_checkpoint_step" -ge 0 ]; then
log "Continuing training from checkpoint-$latest_checkpoint_step"
step=$latest_checkpoint_step
train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
else
log "Starting training from scratch as no checkpoint was found in $exp_dir"
# No pretrained model or sampler state dict needed for the first run
fi

torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train_tts.py \
$train_cmd_args
fi


# if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
# log "stage 20: Training TTS Model"
# echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
# if [ ! -L "/workspace/slam" ]; then
# cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
# fi
# exp_dir=./qwen_omni/exp_test
# ngpu=4

# latest_checkpoint_step=-1
# # Check if exp_dir exists and is a directory
# if [ -d "$exp_dir" ]; then
# # List directories matching checkpoint-* and find the one with the largest step number
# for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
# checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# # Extract step number using parameter expansion
# current_step=${checkpoint_name#checkpoint-}
# # Ensure current_step is a number
# if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
# latest_checkpoint_step=$current_step
# fi
# done
# fi

# train_cmd_args="--max-duration 150 \
# --enable-musan False \
# --exp-dir $exp_dir \
# --speech-encoder-path-or-name models/large-v2.pt \
# --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
# --dataset vocalnet_ultrachat_voiceassistant \
# --manifest-dir data/fbank \
# --deepspeed \
# --deepspeed_config ./qwen_omni/ds_config_zero1.json \
# --use-flash-attn True --on-the-fly-feats True \
# --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True"

# if [ "$latest_checkpoint_step" -ge 0 ]; then
# log "Continuing training from checkpoint-$latest_checkpoint_step"
# step=$latest_checkpoint_step
# train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
# else
# log "Starting training from scratch as no checkpoint was found in $exp_dir"
# # No pretrained model or sampler state dict needed for the first run
# fi

# torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
# $train_cmd_args
# fi


# if [ $stage -le 21 ] && [ $stop_stage -ge 21 ]; then
# log "stage 21: TTS Decoding Test Set"
# exp_dir=./qwen_omni/exp_tts
# torchrun --nproc_per_node=2 ./qwen_omni/decode_tts.py \
# --exp-dir $exp_dir \
# --speech-encoder-path-or-name models/large-v2.pt \
# --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
# --pretrained-model-path $exp_dir/checkpoint-32001/pytorch_model.bin \
# --use-flash-attn True \
# --enable-speech-output True \
# --token2wav-path /workspace/CosyVoice2-0.5B \
# --use-lora True
# fi