examples/librispeech/ssl/README.md (new file)
# Performance Record

## Conformer Result (Base, 12 layers)

Pretrained Conformer setup:
* pretrain config: conf/pretrain/train_conformer_pretrain_w2v.yaml
* finetune config: conf/finetune/train_conformer_100h.yaml
* beam: 10
* num of gpu: 8
* num of averaged model: 20 (see the averaging sketch after the results tables)
* ctc weight (used for attention rescoring): 0.5 (see the rescoring sketch below)
* pretrain 90 epochs, finetune 80 epochs
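
The `ctc weight` above controls attention rescoring: CTC prefix beam search proposes n-best hypotheses, the attention decoder scores each one, and the two scores are interpolated. A minimal sketch of that combination (the `att_scorer` callable is a stand-in for the real attention-decoder scoring, which runs batched over the encoder output):

```python
# Attention rescoring sketch: interpolate the CTC score of each n-best
# hypothesis with its attention-decoder score using ctc_weight.

def attention_rescoring(nbest, att_scorer, ctc_weight=0.5):
    """nbest: list of (tokens, ctc_log_prob); att_scorer: tokens -> log-prob."""
    best_hyp, best_score = None, float("-inf")
    for tokens, ctc_log_prob in nbest:
        score = (ctc_weight * ctc_log_prob
                 + (1.0 - ctc_weight) * att_scorer(tokens))
        if score > best_score:
            best_hyp, best_score = tokens, score
    return best_hyp
```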

Test-set results below are for the model fine-tuned on the 100-hour train-clean-100 subset.

## wav2vec2.0 Results

WER (%), full-context decoding:

| decoding mode          | test-clean | test-other |
|------------------------|------------|------------|
| ctc prefix beam search | 5.77       | 12.73      |
| attention rescoring    | 5.30       | 12.14      |
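
The WERs above are decoded from a single model built by averaging 20 checkpoints (the `num of averaged model` setting). A minimal averaging sketch, assuming plain PyTorch state dicts; the checkpoint paths are illustrative:

```python
# Checkpoint averaging sketch: element-wise mean of the parameters of
# N saved checkpoints (N = 20 here). Paths are illustrative.
import torch

def average_checkpoints(paths):
    avg = None
    for p in paths:
        state = torch.load(p, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].float()
    for k in avg:
        avg[k] /= len(paths)
    return avg

paths = [f"exp/epoch_{i}.pt" for i in range(61, 81)]  # last 20 of 80 epochs
torch.save(average_checkpoints(paths), "exp/avg_20.pt")
```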


## data2vec Results

In progress.
examples/librispeech/ssl/conf/finetune/train_conformer_100h.yaml (new file)
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.0
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 512
num_blocks: 2
dropout_rate: 0.1
positional_dropout_rate: 0.0
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
ctc_weight: 0.7
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# dataset related
dataset_conf:
filter_conf:
max_length: 4000
min_length: 50
token_max_length: 400
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 12

pretrain: False
wav2vec_conf:
pretrain: False
quantize_targets: True
project_targets: True
latent_vars: 320
latent_dim: 512
latent_groups: 2
mask: False

grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100

optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 15000
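
All of these configs schedule the learning rate with `warmuplr`. A minimal sketch of the rule such schedulers typically implement, assuming the common Noam-style form (linear warmup to the peak `lr` over `warmup_steps`, then inverse-square-root decay; the exact implementation may differ):

```python
# Warmup LR sketch (assumed Noam-style): rise linearly for warmup_steps
# steps, peak at base_lr, then decay with the inverse square root of
# the step count. Peak: warmup_lr(15000) == 0.001.

def warmup_lr(step, base_lr=0.001, warmup_steps=15000):
    step = max(step, 1)  # avoid division by zero at step 0
    return base_lr * warmup_steps**0.5 * min(step**-0.5,
                                             step * warmup_steps**-1.5)
```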
(new file: data2vec fine-tuning config; path not shown in the diff view)
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.0
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 31
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 512
num_blocks: 2
dropout_rate: 0.1
positional_dropout_rate: 0.0
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
ctc_weight: 0.7
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: true
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'static' # static or dynamic
batch_size: 10

pretrain: False
data2vec_conf:
pretrain: False
intermediate_layers: [4,5,6,7,8,9,10,11]
ema_anneal_end_step: 30000
mask: False
mask_prob: 0.65

grad_clip: 5
accum_grad: 1
max_epoch: 120
log_interval: 100

optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 15000
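
`ema_anneal_end_step` governs the data2vec teacher: the teacher network is an exponential moving average (EMA) of the student, with the decay rate annealed toward its final value over the first 30000 steps. A minimal sketch; the 0.999/0.9999 decay endpoints are illustrative assumptions and live in the model code, not in this config:

```python
# data2vec teacher-update sketch: the EMA decay is annealed linearly
# from a start value to an end value over ema_anneal_end_step steps.
# The 0.999 and 0.9999 endpoints are illustrative assumptions.
import torch

def ema_decay(step, start=0.999, end=0.9999, anneal_end_step=30000):
    if step >= anneal_end_step:
        return end
    return start + (end - start) * step / anneal_end_step

@torch.no_grad()
def update_teacher(teacher_params, student_params, step):
    d = ema_decay(step)
    for t, s in zip(teacher_params, student_params):
        t.mul_(d).add_(s, alpha=1.0 - d)  # t = d * t + (1 - d) * s
```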
(new file: data2vec pretraining config; path not shown in the diff view)
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.0
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 31
cnn_module_norm: 'layer_norm'
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.0
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
ctc_weight: 1.0
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# dataset related
dataset_conf:
filter_conf:
max_length: 2000
min_length: 50
token_max_length: 400
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: false
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 1.0
spec_aug: false
spec_aug_conf:
num_t_mask: 3
num_f_mask: 2
max_t: 50
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'dynamic' # static or dynamic
max_frames_in_batch: 20000
batch_size: 20

pretrain: True
data2vec_conf:
pretrain: True
intermediate_layers: [4,5,6,7,8,9,10,11]
ema_anneal_end_step: 30000
mask: True
mask_prob: 0.65

grad_clip: 5
accum_grad: 4
max_epoch: 90
log_interval: 100

optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
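
Unlike the fine-tuning configs, this pretraining config uses `batch_type: 'dynamic'` with `max_frames_in_batch: 20000`, so batches are sized by padded frame count rather than a fixed number of utterances. A minimal sketch of that bucketing rule (the `utts` input of `(key, num_frames)` pairs simplifies the real dataset pipeline):

```python
# Dynamic batching sketch: accumulate utterances until the padded size
# (longest utterance x batch size) would exceed max_frames_in_batch.

def dynamic_batches(utts, max_frames_in_batch=20000):
    """utts: iterable of (key, num_frames) pairs, ideally length-sorted."""
    batch, longest = [], 0
    for key, num_frames in utts:
        longest = max(longest, num_frames)
        if longest * (len(batch) + 1) > max_frames_in_batch and batch:
            yield batch
            batch, longest = [], num_frames
        batch.append((key, num_frames))
    if batch:
        yield batch
```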